/*******************************************************************************
*                                                                              *
*   (C) 1997-2021 by Ernst W. Mayer.                                           *
*                                                                              *
*  This program is free software; you can redistribute it and/or modify it     *
*  under the terms of the GNU General Public License as published by the       *
*  Free Software Foundation; either version 2 of the License, or (at your      *
*  option) any later version.                                                  *
*                                                                              *
*  This program is distributed in the hope that it will be useful, but WITHOUT *
*  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
*  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
*  more details.                                                               *
*                                                                              *
*  You should have received a copy of the GNU General Public License along     *
*  with this program; see the file GPL.txt.  If not, you may view one at       *
*  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
*  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
*  02111-1307, USA.                                                            *
*                                                                              *
*******************************************************************************/
#if 0
COMPILER BUG NOTE: If encounter errors of the following kinds:
clang:
	fatal error: error in backend: Ran out of registers during register allocation!
	Please check your inline asm statement for invalid constraints:
gcc:
	error: cannot find a register in class ‘GENERAL_REGS’ while reloading ‘asm’
	error: ‘asm’ operand has impossible constraints

Check the compile optimization level - If -O0, try upping to at least -O1.
#endif
/*******************************************************************************
   We now include this header file if it was not included before.
*******************************************************************************/
#ifndef carry_gcc_h_included
#define carry_gcc_h_included

	/************** See the Visual-studio-style 32-bit analogs of these in carry.h for commented versions: **********/

#ifdef USE_ARM_V8_SIMD

	/*************************************************************/
	/**************** MERSENNE-MOD CARRY MACROS ******************/
	/*************************************************************/

	/***
	To-Do: See if LD1 faster than LDP. Ref: https://stackoverflow.com/questions/29742844/a64-neon-simd-256-bit-comparison
	***/
	// Use the following x86_64-to-ARMv8 GPR name translations: r[a-d]x,rsi,rdi -> x0-5:
	#define SSE2_cmplx_carry_fast_pow2_wtsinit(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xn_minus_sil2,Xn_minus_silp2,Xsinwt2,Xsinwtm2, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
	/**********************************************/\
	/*  (j  ),  Real      parts                   */\
	/**********************************************/\
		"ldr	x4,%[__half_arr]	\n\t"\
		"ldr	q30,[x4,#0x30]		\n\t"\
		"ldr	q31,[x4,#0x70]		\n\t"\
		"ldr	x5,%[__sse_bw]		\n\t	ldr	q6,[x5]	\n\t"\
		"ldr	x6,%[__sse_nm1]		\n\t	ldr	q7,[x6]	\n\t"\
		/* For the ARMv8 sans-table-lookup impl, Here are the needed consts and opmasks.
		[1] Fwd-wt multipliers: Init = 0.50 x 2, anytime SSE2-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 2, anytime SSE2-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"ldr	x0,%[__bjmod_0]		\n\t"/* Pointer to bjmodn data */\
		"ldr	q0,[x0]				\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr	w2,%[__n_minus_sil]	\n\t	ldr	w3,%[__sinwt]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"/* Broadcast via LD1R only works from *pointers*, so use DUP */\
		"ldr	x5,%[__wtA]	\n\t"/* stash these 2 ptrs in x5,6 rather than x1,2 to make persistent, even */\
		"ldr	x6,%[__wtB]	\n\t"/* though persistence of x6 unneeded until next block when it takes wtC */\
		/* NB: In ARMv8 asm, '[cmp] vc,vb,va' corr. to vc = (vb [cmp] va): */\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"/* n_minus_sil[v2] >= bjmod[0:3][v0] ? Resulting opmask bit-flipped-analog of SSE2-mode opmask stored in xmm2 */\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"/* bjmod[0:3] [v0] >=      sinwt[v3] ? Resulting opmask bit-flipped-analog of SSE2-mode opmask stored in xmm1 */\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"/* v18 = v8[1]x2,v8[0]x2; v19 = v8[3]x2,v8[2]x2 */\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"/* v28 = v9[1]x2,v9[0]x2, v29 = v9[3]x2,v9[2]x2 */\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"/* one_half[m0-3] multiplier for wt    */\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"/* one_half[n0-3] multiplier for wtinv */\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtB[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"/* swap lo,hi doubles within v3,v5 */\
		"ldr	q8,[x4,#0x180]		\n\t"/* wtl */\
		"ldr	q9,[x4,#0x1a0]		\n\t"/* wtn */\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"/* wt   =wtA*wtl */\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"/* wtinv=wtB*wtn */\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"/* wt   =wt   *one_half[m01] */\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
											/* Get ready for next set [IM0~] : */\
		"stp	q2,q4,[x4,#0x180]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"/* bjmod[0:3] += bw  */\
		"stp	q3,q5,[x4,#0x1a0]	\n\t	and	v0.16b,v0.16b,v7.16b	\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*  (j  ),  Imaginary parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_silp1]	\n\t	ldr	w3,%[__sinwtm1]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"ldr	x6,%[__wtC]	\n\t"/* No need to reload x6 from hereon */\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x1c0]		\n\t"\
		"ldr	q9,[x4,#0x1e0]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
											/* Get ready for next set [RE1~] : */\
		"stp	q2,q4,[x4,#0x1c0]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"\
		"stp	q3,q5,[x4,#0x1e0]	\n\t	and	v0.16b,v0.16b,v7.16b	\n\t"\
	/**********************************************/\
	/*  (j+2),  Real      parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_sil2]	\n\t	ldr	w3,%[__sinwt2]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x200]		\n\t"\
		"ldr	q9,[x4,#0x220]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
											/* Get ready for next set [IM1~] : */\
		"stp	q2,q4,[x4,#0x200]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"\
		"stp	q3,q5,[x4,#0x220]	\n\t	and	v0.16b,v0.16b,v7.16b	\n\t"\
	/**********************************************/\
	/*  (j+2),  Imaginary parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_silp2]	\n\t	ldr	w3,%[__sinwtm2]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x240]		\n\t"\
		"ldr	q9,[x4,#0x260]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
		"stp	q2,q4,[x4,#0x240]	\n\t"\
		"stp	q3,q5,[x4,#0x260]	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__n_minus_sil2]	"m" (Xn_minus_sil2)	\
		, [__n_minus_silp2] "m" (Xn_minus_silp2)\
		, [__sinwt2]	"m" (Xsinwt2)		\
		, [__sinwtm2]	"m" (Xsinwtm2)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6"\
		,"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v18","v19","v28","v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// Non-power-of-2-length version of above differs only in how we reduce (mod n) in the index computations:
	// Non-power-of-2-length version of above differs only in how we reduce (mod n) in the index computations:
	// instead of the single AND-with-(n-1) mask, each bjmod update does a compare-and-conditional-subtract of n.
	// Args are as in the pow2 version except the final arg: Xsse_n = pointer to 4-fold copy of n (not n-1).
	#define SSE2_cmplx_carry_fast_wtsinit(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xn_minus_sil2,Xn_minus_silp2,Xsinwt2,Xsinwtm2, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
	/**********************************************/\
	/*  (j  ),  Real      parts                   */\
	/**********************************************/\
		"ldr	x4,%[__half_arr]	\n\t"\
		"ldr	q30,[x4,#0x30]		\n\t"/* v30 = 0.50 x 2: base multiplicand for fwd-wt doubling-mask scheme */\
		"ldr	q31,[x4,#0x70]		\n\t"/* v31 = 0.25 x 2: base multiplicand for inv-wt doubling-mask scheme */\
		"ldr	x5,%[__sse_bw]		\n\t	ldr	q6,[x5]	\n\t"/* v6 = bw, 4-fold */\
		"ldr	x6,%[__sse_n]		\n\t	ldr	q7,[x6]	\n\t"/* v7 = n , 4-fold (subtrahend for non-pow2 mod-n reduction) */\
		/* For the ARMv8 sans-table-lookup impl, here are the needed consts and opmasks.
		[1] Fwd-wt multipliers: Init = 0.50 x 2, anytime SSE2-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 2, anytime SSE2-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"ldr	x0,%[__bjmod_0]		\n\t"/* Pointer to bjmodn data */\
		"ldr	q0,[x0]				\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr	w2,%[__n_minus_sil]	\n\t	ldr	w3,%[__sinwt]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"/* Broadcast via LD1R only works from *pointers*, so use DUP */\
		"ldr	x5,%[__wtA]	\n\t"/* stash these 2 ptrs in x5,6 rather than x1,2 to make persistent, even */\
		"ldr	x6,%[__wtB]	\n\t"/* though persistence of x6 unneeded until next block when it takes wtC */\
		/* NB: In ARMv8 asm, '[cmp] vc,vb,va' corr. to vc = (vb [cmp] va): */\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"/* n_minus_sil[v2] >= bjmod[0:3][v0] ? Resulting opmask bit-flipped-analog of SSE2-mode opmask stored in xmm2 */\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"/* bjmod[0:3] [v0] >=      sinwt[v3] ? Resulting opmask bit-flipped-analog of SSE2-mode opmask stored in xmm1 */\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"/* v18 = v8[1]x2,v8[0]x2; v19 = v8[3]x2,v8[2]x2 */\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"/* v28 = v9[1]x2,v9[0]x2, v29 = v9[3]x2,v9[2]x2 */\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"/* mask? 0.50 : 0 */\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"/* mask? 0.25 : 0 */\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"/* one_half[m0-3] multiplier for wt    */\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"/* one_half[n0-3] multiplier for wtinv */\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtB[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"/* swap lo,hi doubles within v3,v5 */\
		"ldr	q8,[x4,#0x180]		\n\t"/* wtl */\
		"ldr	q9,[x4,#0x1a0]		\n\t"/* wtn */\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"/* wt   =wtA*wtl */\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"/* wtinv=wtB*wtn */\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"/* wt   =wt   *one_half[m01] */\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
											/* Get ready for next set [IM0~] : */\
		"stp	q2,q4,[x4,#0x180]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"/* bjmod[0:3] += bw  */\
		"stp	q3,q5,[x4,#0x1a0]	\n\t   cmge	v8.4s, v0.4s, v7.4s		\n\t"/* opmask v8 = (bjmod[0:3][v0] >= n[v7]) ? */\
		"									and	v9.16b,v8.16b,v7.16b	\n\t"/* v9 = n in lanes where bjmod >= n, else 0 */\
		"									sub	v0.4s, v0.4s, v9.4s		\n\t"/* if(bjmod[0:3] >= n) bjmod[0:3] -= n */\
	/**********************************************/\
	/*  (j  ),  Imaginary parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_silp1]	\n\t	ldr	w3,%[__sinwtm1]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"ldr	x6,%[__wtC]	\n\t"/* No need to reload x6 from hereon */\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x1c0]		\n\t"\
		"ldr	q9,[x4,#0x1e0]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
											/* Get ready for next set [RE1~] : */\
		"stp	q2,q4,[x4,#0x1c0]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"\
		"stp	q3,q5,[x4,#0x1e0]	\n\t   cmge	v8.4s, v0.4s, v7.4s		\n\t"\
		"									and	v9.16b,v8.16b,v7.16b	\n\t"\
		"									sub	v0.4s, v0.4s, v9.4s		\n\t"\
	/**********************************************/\
	/*  (j+2),  Real      parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_sil2]	\n\t	ldr	w3,%[__sinwt2]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x200]		\n\t"\
		"ldr	q9,[x4,#0x220]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
											/* Get ready for next set [IM1~] : */\
		"stp	q2,q4,[x4,#0x200]	\n\t	add	v0.4s ,v0.4s ,v6.4s		\n\t"\
		"stp	q3,q5,[x4,#0x220]	\n\t   cmge	v8.4s, v0.4s, v7.4s		\n\t"\
		"									and	v9.16b,v8.16b,v7.16b	\n\t"\
		"									sub	v0.4s, v0.4s, v9.4s		\n\t"\
	/**********************************************/\
	/*  (j+2),  Imaginary parts                   */\
	/**********************************************/\
		"mov	v1.16b,v0.16b		\n\t"/* bjmod[0:3] COPY in xmm1 */\
		"ldr w2,%[__n_minus_silp2]	\n\t	ldr	w3,%[__sinwtm2]	\n\t"\
		"dup	v2.4s,w2			\n\t	dup	v3.4s,w3		\n\t"\
		"cmge	v8.4s,v2.4s,v0.4s	\n\t"\
		"cmge	v9.4s,v0.4s,v3.4s	\n\t"\
		"zip1	v18.4s,v8.4s,v8.4s	\n\t	zip2	v19.4s,v8.4s,v8.4s	\n\t"\
		"zip1	v28.4s,v9.4s,v9.4s	\n\t	zip2	v29.4s,v9.4s,v9.4s	\n\t"\
		"and v18.16b,v18.16b,v30.16b\n\t	and	v19.16b,v19.16b,v30.16b	\n\t"\
		"and v28.16b,v28.16b,v31.16b\n\t	and	v29.16b,v29.16b,v31.16b	\n\t"\
		"fadd	v18.2d,v18.2d,v30.2d\n\t	fadd	v19.2d,v19.2d,v30.2d\n\t"\
		"fadd	v28.2d,v28.2d,v31.2d\n\t	fadd	v29.2d,v29.2d,v31.2d\n\t"\
		"ldp	q2,q4,[x5]			\n\t"/* wtA[j  ] */\
		"ldp	q5,q3,[x6,#-0x10]	\n\t"/* wtC[j-1] */\
		"ext v3.16b,v3.16b,v3.16b,#8\n\t"\
		"ext v5.16b,v5.16b,v5.16b,#8\n\t"\
		"ldr	q8,[x4,#0x240]		\n\t"\
		"ldr	q9,[x4,#0x260]		\n\t"\
		"fmul	v2.2d,v8.2d ,v2.2d	\n\t	fmul	v4.2d,v8.2d ,v4.2d	\n\t"\
		"fmul	v3.2d,v9.2d ,v3.2d	\n\t	fmul	v5.2d,v9.2d ,v5.2d	\n\t"\
		"fmul	v2.2d,v18.2d,v2.2d	\n\t	fmul	v4.2d,v19.2d,v4.2d	\n\t"\
		"fmul	v3.2d,v28.2d,v3.2d	\n\t	fmul	v5.2d,v29.2d,v5.2d	\n\t"\
		"stp	q2,q4,[x4,#0x240]	\n\t"\
		"stp	q3,q5,[x4,#0x260]	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__n_minus_sil2]	"m" (Xn_minus_sil2)	\
		, [__n_minus_silp2] "m" (Xn_minus_silp2)\
		, [__sinwt2]	"m" (Xsinwt2)		\
		, [__sinwtm2]	"m" (Xsinwtm2)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6"\
		,"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v18","v19","v28","v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// "Fused" means that - like the HIACC macros - we process 4 carry chains, one from each separate array section corr.
	// to each wide-strided final-iFFT-pass output, at a time, but fuse the [j,j+2] linear-index-within-each-array-section
	// processing (done separately in the HIACC case by the 1_2B and 2_2B SSE2 carry macros) into a single macro. This
	// fusion is eased by the fact that the LOACC chained-weights-computation needs no weights-reinit-from-scalar-data
	// step for the [j+2] data.
	//
	#define SSE2_cmplx_carry_fast_pow2_errcheck(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		/***************Unpack the data:*************************/\
		"ldr	x0,%[__data]		\n\t"\
		"ldp	q0,q1,[x0      ]	\n\t	ldp	q4 ,q5 ,[x0,#0x40]	\n\t"\
		"ldp	q8,q9,[x0,#0x20]	\n\t	ldp	q10,q11,[x0,#0x60]	\n\t"\
		"trn2	v2.2d,v0.2d,v8.2d	\n\t	trn2	v6.2d,v4.2d,v10.2d	\n\t"\
		"trn2	v3.2d,v1.2d,v9.2d	\n\t	trn2	v7.2d,v5.2d,v11.2d	\n\t"\
		"trn1	v0.2d,v0.2d,v8.2d	\n\t	trn1	v4.2d,v4.2d,v10.2d	\n\t"\
		"trn1	v1.2d,v1.2d,v9.2d	\n\t	trn1	v5.2d,v5.2d,v11.2d	\n\t"\
		"ldr	x1,%[__bjmod_0]		\n\t"/* Pointer to bjmodn data */\
		"ldr	x2,%[__sse_sw]		\n\t"\
		"ldr	w3,%[__i]			\n\t"\
		"eor v30.16b,v30.16b,v30.16b\n\t"/* Zero v30 */\
		"ldr	q8,[x1]				\n\t"/* bjmod[0:3], PERSISTENT COPY IN V8 */\
		"ldr	q9,[x2]				\n\t"/* sw, 4-fold, PERSISTENT COPY IN V9 */\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"/* sw[v9] >= bjmod[0:3][v8] ? Resulting opmask bit-flipped-analog of SSE2-mode (sw < bjmod) opmask stored in xmm7 */\
		"ins	v30.s[0],w3			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by adding I to to the low int32 lane of v9 */\
		"ldr	x3,%[__sse_bw]		\n\t	ldr	q10,[x3]	\n\t"/* bw , 4-fold, PERSISTENT COPY IN V10 */\
		"ldr	x4,%[__sse_nm1]		\n\t	ldr	q11,[x4]	\n\t"/* nm1, 4-fold, PERSISTENT COPY IN V11 */\
		"add	v31.4s,v31.4s,v30.4s\n\t"/* (which == 111...111 on input in this case), thus zeroing it. Otherwise I == 0, thus the add = no-op. */\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"/* v29 = v31[1]x2,v31[0]x2; v31 = v31[3]x2,v31[2]x2 */\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"/* negative-masks; v28-31 are ARM analogs of AVX-512 opmasks k1-4 */\
		"ldr	x2,%[__half_arr]	\n\t"\
		"ldr	x3,%[__cyA]			\n\t	ldr	x4,%[__cyB]			\n\t"\
		"ldr	q14,[x2,#-0x20]		\n\t"/* maxerr, then place 2nd copy in v15, allowing both cols */\
		"mov	v15.16b,v14.16b		\n\t"/* to do independent updates with just one merge at end */\
		/* Load the 4 double-pairs base[0]x2,baseinv[1]x2,wts_mult[1]x2,inv_mult[0]x2 and also compute 2x each one
		for the BSL operations we use to effect the conditional-doublings of these consts. (The register-leaner
		alternative is to just store the 4 lcol-loaded vectors and do an AND/ADD pair for each conditional-doubling,
		in place of just a single BSL). ARM's 32 vector-registers come in handy here, because this carry macro is
		sufficiently intricate that we use all 32 of the vregs: */\
	/* Mar 2018: Need to free a vreg to store prp_mult, su use v21 and do v20-doublings on-the-fly: */\
	"ldr	x5,%[__prp_mult]	\n\t	ld1r	{v21.2d},[x5]		\n\t"\
		"ldr	q20,[x2,#0x80]		\n\t"/* base[0] */\
		"ldr	q22,[x2,#0xf0]		\n\t"/*	fadd	v21.2d,v20.2d,v20.2d\n\t*** baseinv[1]	[base[0]x2] */\
		"ldr	q24,[x2,#0x130]		\n\t	fadd	v23.2d,v22.2d,v22.2d\n\t"/* wts_mult[1]	baseinv[1]x2 */\
		"ldr	q26,[x2,#0x140]		\n\t	fadd	v25.2d,v24.2d,v24.2d\n\t"/* inv_mult[0]	wts_mult[1]x2 (needed for conditional-doubling) */\
		"									fadd	v27.2d,v26.2d,v26.2d\n\t"/* 			inv_mult[0]x2 (needed for (wt_re >= inv_mult[1]) comparison) */\
/**** 12,13,18,19 FREE ****/\
	/*******************************/\
	/* Do A.re pair: Data in v0,4: */\
	/*******************************/\
	"ldr	x5,%[__add0]		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
	"prfm	PLDL1KEEP,[x5]		\n\t"\
		"ldp	q18,q19,[x2,#0x1a0]	\n\t"/* wi_re ... need to preserve these 2 regs for inv_mult-update at end of block */\
		"fmul	v16.2d,v0.2d,v18.2d	\n\t	fmul	v17.2d,v4.2d,v19.2d	\n\t"/* x *= wtinv */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"/* [4] Inv-base mults */\
		"frintn	v12.2d,v16.2d		\n\t	frintn	v13.2d,v17.2d		\n\t"/* DNINT(x) */\
	"ldr q0,[x3] \n\t ldr q4,[x4] \n\t"/* cyA,B for our 1 independent carry-chain pairs */\
		"fmla	v0.2d,v12.2d,v21.2d	\n\t	fmla	v4.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub v12.2d,v16.2d,v12.2d	\n\t	fsub v13.2d,v17.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul v29.2d, v0.2d,v29.2d	\n\t	fmul v31.2d, v4.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v29.2d		\n\t	frintn	v17.2d,v31.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"/* frac = fabs(x - DNINT(x)) */\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"/* if(frac > maxerr) maxerr=frac */\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"/* [3] Fwd-base mults */\
		"ldp	q12,q13,[x2,#0x180]	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		"fmls	v0.2d,v16.2d,v28.2d	\n\t	fmls	v4.2d,v17.2d,v30.2d	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"/* v29,31 = (wt >= inv_mult[1]) */\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"/* inverse-masks */\
		"fmul	v0.2d,v0.2d,v12.2d	\n\t	fmul	v4.2d,v4.2d,v13.2d	\n\t"/* x *= wt */\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"/* [5] wts_mult */\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"/* wt *= wts_mult[i] */\
		"stp	q12,q13,[x2,#0x180]	\n\t"/* Store wt */\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"/* [6] inv_mult */\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"/* wi *= inv_mult[i] */\
		"stp	q18,q19,[x2,#0x1a0]	\n\t"/* Store wi */\
		/* Apr 2018: Haven't found a way to support prp_mult in the ARMv8 macros w/o storing 2 vec-data to free
		a reg-pair. Spilling v0,4 seems cheapest because only need to restore it for closing transpose step: */\
		"stp	q0,q4,[x0] 			\n\t"\
		/* Get ready for next set [IM0~] : */\
		"add	v8.4s ,v8.4s ,v10.4s	\n\t"/* bjmod[0:3] += bw  */\
		"and	v8.16b,v8.16b,v11.16b	\n\t"/* bjmod[0:3] &= nm1 */\
	/*******************************/\
	/* Do A.im pair: Data in v1,5: */\
	/*******************************/\
	"ldr	w6,%[__p1]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"/* sw[v9] >= bjmod[0:3][v8] ? */\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x1e0]	\n\t"\
		"fmul	v0.2d,v1.2d,v18.2d	\n\t	fmul	v4.2d,v5.2d,v19.2d	\n\t"/* x *= wtinv */\
		"mov	v1.16b,v16.16b		\n\t	mov	v5.16b,v17.16b			\n\t"/* copy carries into v1,5 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v1.2d,v12.2d,v21.2d	\n\t	fmla	v5.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v1.2d,v29.2d	\n\t	fmul	v17.2d,v5.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"/* frac = fabs(x - DNINT(x)) */\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"/* if(frac > maxerr) maxerr=frac */\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"/* [3] Fwd-base mults */\
		"ldp	q12,q13,[x2,#0x1c0]	\n\t"\
		"fmls	v1.2d,v16.2d,v28.2d	\n\t	fmls	v5.2d,v17.2d,v30.2d	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v1.2d,v1.2d,v12.2d	\n\t	fmul	v5.2d,v5.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x1c0]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x1e0]	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"add	v8.4s ,v8.4s ,v10.4s	\n\t"\
		"and	v8.16b,v8.16b,v11.16b	\n\t"\
	/**********************************************/\
	/*          Now do the (j+2) data:            */\
	/**********************************************/\
	/*******************************/\
	/* Do B.re pair: Data in v2,6: */\
	/*******************************/\
	"ldr	w6,%[__p2]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x220]	\n\t"\
		"fmul	v0.2d,v2.2d,v18.2d	\n\t	fmul	v4.2d,v6.2d,v19.2d	\n\t"\
		"mov	v2.16b,v16.16b		\n\t	mov	v6.16b,v17.16b			\n\t"/* copy carries into v2,6 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v2.2d,v12.2d,v21.2d	\n\t	fmla	v6.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v2.2d,v29.2d	\n\t	fmul	v17.2d,v6.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"\
		"ldp	q12,q13,[x2,#0x200]	\n\t"\
		"fmls	v2.2d,v16.2d,v28.2d	\n\t	fmls	v6.2d,v17.2d,v30.2d	\n\t"\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v2.2d,v2.2d,v12.2d	\n\t	fmul	v6.2d,v6.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x200]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x220]	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"add	v8.4s ,v8.4s ,v10.4s	\n\t"\
		"and	v8.16b,v8.16b,v11.16b	\n\t"\
	/*******************************/\
	/* Do B.im pair: Data in v3,7: */\
	/*******************************/\
	"ldr	w6,%[__p3]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x260]	\n\t"\
		"fmul	v0.2d,v3.2d,v18.2d	\n\t	fmul	v4.2d,v7.2d,v19.2d	\n\t"\
		"mov	v3.16b,v16.16b		\n\t	mov	v7.16b,v17.16b			\n\t"/* copy carries into v3,7 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v3.2d,v12.2d,v21.2d	\n\t	fmla	v7.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v3.2d,v29.2d	\n\t	fmul	v17.2d,v7.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"\
		"ldp	q12,q13,[x2,#0x240]	\n\t"\
		"fmls	v3.2d,v16.2d,v28.2d	\n\t	fmls	v7.2d,v17.2d,v30.2d	\n\t"\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v3.2d,v3.2d,v12.2d	\n\t	fmul	v7.2d,v7.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x240]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x260]	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"add	v8.4s ,v8.4s ,v10.4s	\n\t"\
		"and	v8.16b,v8.16b,v11.16b	\n\t"\
		"str	q8,[x1]					\n\t"/* write bjmod[0:3] */\
		"str q16,[x3] \n\t str q17,[x4] \n\t"/* write cyA,B */\
		"fmax	v14.2d,v14.2d,v15.2d	\n\t"/* maxerr between the 2 cols */\
		"str	q14,[x2,#-0x20]			\n\t"/* write maxerr */\
		/**********************************************/\
		/*              Repack the data:              */\
		/**********************************************/\
		"ldp	q0,q4,[x0] 			\n\t"/* Restore earlier-spilled data pair */\
		"trn2	v8.2d,v0.2d,v2.2d	\n\t	trn2	v10.2d,v4.2d,v6.2d	\n\t"\
		"trn2	v9.2d,v1.2d,v3.2d	\n\t	trn2	v11.2d,v5.2d,v7.2d	\n\t"\
		"trn1	v0.2d,v0.2d,v2.2d	\n\t	trn1	v4.2d ,v4.2d,v6.2d	\n\t"\
		"trn1	v1.2d,v1.2d,v3.2d	\n\t	trn1	v5.2d ,v5.2d,v7.2d	\n\t"\
		"stp	q0,q1,[x0      ]	\n\t	stp	q4 ,q5 ,[x0,#0x40]	\n\t"\
		"stp	q8,q9,[x0,#0x20]	\n\t	stp	q10,q11,[x0,#0x60]	\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__i]			"m" (Xi)			\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15",\
					"v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// Non-power-of-2-length version of above differs only in how we reduce (mod n) in the index computations:
	// where the pow2 variant reduces bjmod via a single AND with (n-1), here we compare against n and
	// conditionally subtract it (CMGE/AND/SUB quartet). All other logic - the 4 carry-chain pairs,
	// fwd/inv-weight updates, and ROE (maxerr) tracking - is identical to the pow2 macro above.
	// Register contract (see comments inline): x0-x6 scratch pointers/indices; v8-v11 persistent
	// bjmod/sw/bw/n quartets; v14,15 per-column maxerr; v20-v27 weight/base constants; v21 = prp_mult;
	// v28-v31 serve as the ARM analogs of AVX-512 opmasks. All are declared clobbered below.
	#define SSE2_cmplx_carry_fast_errcheck(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		/***************Unpack the data:*************************/\
		"ldr	x0,%[__data]		\n\t"\
		"ldp	q0,q1,[x0      ]	\n\t	ldp	q4 ,q5 ,[x0,#0x40]	\n\t"/* Load 4 pairs of vec-complex data */\
		"ldp	q8,q9,[x0,#0x20]	\n\t	ldp	q10,q11,[x0,#0x60]	\n\t"\
		"trn2	v2.2d,v0.2d,v8.2d	\n\t	trn2	v6.2d,v4.2d,v10.2d	\n\t"/* 2x2 transposes de-interleave the data */\
		"trn2	v3.2d,v1.2d,v9.2d	\n\t	trn2	v7.2d,v5.2d,v11.2d	\n\t"\
		"trn1	v0.2d,v0.2d,v8.2d	\n\t	trn1	v4.2d,v4.2d,v10.2d	\n\t"\
		"trn1	v1.2d,v1.2d,v9.2d	\n\t	trn1	v5.2d,v5.2d,v11.2d	\n\t"\
		"ldr	x1,%[__bjmod_0]		\n\t"/* Pointer to bjmodn data */\
		"ldr	x2,%[__sse_sw]		\n\t"\
		"ldr	w3,%[__i]			\n\t"\
		"eor v30.16b,v30.16b,v30.16b\n\t"/* Zero v30 */\
		"ldr	q8,[x1]				\n\t"/* bjmod[0:3], PERSISTENT COPY IN V8 */\
		"ldr	q9,[x2]				\n\t"/* sw, 4-fold, PERSISTENT COPY IN V9 */\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"/* sw[v9] >= bjmod[0:3][v8] ? Resulting opmask bit-flipped-analog of SSE2-mode (sw < bjmod) opmask stored in xmm7 */\
		"ins	v30.s[0],w3			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by adding I to the low int32 lane of v9 */\
		"ldr	x3,%[__sse_bw]		\n\t	ldr	q10,[x3]	\n\t"/* bw, 4-fold, PERSISTENT COPY IN V10 */\
		"ldr	x4,%[__sse_n]		\n\t	ldr	q11,[x4]	\n\t"/* n , 4-fold, PERSISTENT COPY IN V11 */\
		"add	v31.4s,v31.4s,v30.4s\n\t"/* (which == 111...111 on input in this case), thus zeroing it. Otherwise I == 0, thus the add = no-op. */\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"/* v29 = v31[1]x2,v31[0]x2; v31 = v31[3]x2,v31[2]x2 */\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"/* negative-masks; v28-31 are ARM analogs of AVX-512 opmasks k1-4 */\
		"ldr	x2,%[__half_arr]	\n\t"\
		"ldr	x3,%[__cyA]			\n\t	ldr	x4,%[__cyB]			\n\t"\
		"ldr	q14,[x2,#-0x20]		\n\t"/* maxerr, then place 2nd copy in v15, allowing both cols */\
		"mov	v15.16b,v14.16b		\n\t"/* to do independent updates with just one merge at end */\
		/* Load the 4 double-pairs base[0]x2,baseinv[1]x2,wts_mult[1]x2,inv_mult[0]x2 and also compute 2x each one
		for the BSL operations we use to effect the conditional-doublings of these consts. (The register-leaner
		alternative is to just store the 4 lcol-loaded vectors and do an AND/ADD pair for each conditional-doubling,
		in place of just a single BSL). ARM's 32 vector-registers come in handy here, because this carry macro is
		sufficiently intricate that we use all 32 of the vregs: */\
	/* Mar 2018: Need to free a vreg to store prp_mult, so use v21 and do v20-doublings on-the-fly: */\
	"ldr	x5,%[__prp_mult]	\n\t	ld1r	{v21.2d},[x5]		\n\t"\
		"ldr	q20,[x2,#0x80]		\n\t"/* base[0] */\
		"ldr	q22,[x2,#0xf0]		\n\t"/*	fadd	v21.2d,v20.2d,v20.2d\n\t*** baseinv[1]	[base[0]x2] */\
		"ldr	q24,[x2,#0x130]		\n\t	fadd	v23.2d,v22.2d,v22.2d\n\t"/* wts_mult[1]	baseinv[1]x2 */\
		"ldr	q26,[x2,#0x140]		\n\t	fadd	v25.2d,v24.2d,v24.2d\n\t"/* inv_mult[0]	wts_mult[1]x2 (needed for conditional-doubling) */\
		"									fadd	v27.2d,v26.2d,v26.2d\n\t"/* 			inv_mult[0]x2 (needed for (wt_re >= inv_mult[1]) comparison) */\
/**** 12,13,18,19 FREE ****/\
	/*******************************/\
	/* Do A.re pair: Data in v0,4: */\
	/*******************************/\
	"ldr	x5,%[__add0]		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
	"prfm	PLDL1KEEP,[x5]		\n\t"\
		"ldp	q18,q19,[x2,#0x1a0]	\n\t"/* wi_re ... need to preserve these 2 regs for inv_mult-update at end of block */\
		"fmul	v16.2d,v0.2d,v18.2d	\n\t	fmul	v17.2d,v4.2d,v19.2d	\n\t"/* x *= wtinv */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"/* [4] Inv-base mults */\
		"frintn	v12.2d,v16.2d		\n\t	frintn	v13.2d,v17.2d		\n\t"/* DNINT(x) */\
	"ldr q0,[x3] \n\t ldr q4,[x4] \n\t"/* cyA,B for our 1 independent carry-chain pairs */\
		"fmla	v0.2d,v12.2d,v21.2d	\n\t	fmla	v4.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub v12.2d,v16.2d,v12.2d	\n\t	fsub v13.2d,v17.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul v29.2d, v0.2d,v29.2d	\n\t	fmul v31.2d, v4.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v29.2d		\n\t	frintn	v17.2d,v31.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"/* frac = fabs(x - DNINT(x)) */\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"/* if(frac > maxerr) maxerr=frac */\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"/* [3] Fwd-base mults */\
		"ldp	q12,q13,[x2,#0x180]	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		"fmls	v0.2d,v16.2d,v28.2d	\n\t	fmls	v4.2d,v17.2d,v30.2d	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"/* v29,31 = (wt >= inv_mult[1]) */\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"/* inverse-masks */\
		"fmul	v0.2d,v0.2d,v12.2d	\n\t	fmul	v4.2d,v4.2d,v13.2d	\n\t"/* x *= wt */\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"/* [5] wts_mult */\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"/* wt *= wts_mult[i] */\
		"stp	q12,q13,[x2,#0x180]	\n\t"/* Store wt */\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"/* [6] inv_mult */\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"/* wi *= inv_mult[i] */\
		"stp	q18,q19,[x2,#0x1a0]	\n\t"/* Store wi */\
		/* Apr 2018: Haven't found a way to support prp_mult in the ARMv8 macros w/o storing 2 vec-data to free
		a reg-pair. Spilling v0,4 seems cheapest because only need to restore it for closing transpose step: */\
		"stp	q0,q4,[x0] 			\n\t"\
		/* Get ready for next set [IM0~] : */\
		"add	 v8.4s , v8.4s ,v10.4s	\n\t"/* bjmod[0:3] += bw  */\
		"cmge	v18.4s , v8.4s ,v11.4s	\n\t"/* bjmod[0:3][v8] >= n[v11] ? */\
		"and	v19.16b,v18.16b,v11.16b	\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"sub	 v8.4s , v8.4s ,v19.4s	\n\t"\
	/*******************************/\
	/* Do A.im pair: Data in v1,5: */\
	/*******************************/\
	"ldr	w6,%[__p1]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"/* sw[v9] >= bjmod[0:3][v8] ? */\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x1e0]	\n\t"\
		"fmul	v0.2d,v1.2d,v18.2d	\n\t	fmul	v4.2d,v5.2d,v19.2d	\n\t"/* x *= wtinv */\
		"mov	v1.16b,v16.16b		\n\t	mov	v5.16b,v17.16b			\n\t"/* copy carries into v1,5 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v1.2d,v12.2d,v21.2d	\n\t	fmla	v5.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v1.2d,v29.2d	\n\t	fmul	v17.2d,v5.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"/* frac = fabs(x - DNINT(x)) */\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"/* if(frac > maxerr) maxerr=frac */\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"/* [3] Fwd-base mults */\
		"ldp	q12,q13,[x2,#0x1c0]	\n\t"\
		"fmls	v1.2d,v16.2d,v28.2d	\n\t	fmls	v5.2d,v17.2d,v30.2d	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v1.2d,v1.2d,v12.2d	\n\t	fmul	v5.2d,v5.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x1c0]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x1e0]	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"add	 v8.4s , v8.4s ,v10.4s	\n\t"/* bjmod update (+= bw, mod n), as in A.re block */\
		"cmge	v18.4s , v8.4s ,v11.4s	\n\t"\
		"and	v19.16b,v18.16b,v11.16b	\n\t"\
		"sub	 v8.4s , v8.4s ,v19.4s	\n\t"\
	/**********************************************/\
	/*          Now do the (j+2) data:            */\
	/**********************************************/\
	/*******************************/\
	/* Do B.re pair: Data in v2,6: */\
	/*******************************/\
	"ldr	w6,%[__p2]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"/* big/little-word masks for this data-pair, as at start */\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x220]	\n\t"\
		"fmul	v0.2d,v2.2d,v18.2d	\n\t	fmul	v4.2d,v6.2d,v19.2d	\n\t"\
		"mov	v2.16b,v16.16b		\n\t	mov	v6.16b,v17.16b			\n\t"/* copy carries into v2,6 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v2.2d,v12.2d,v21.2d	\n\t	fmla	v6.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v2.2d,v29.2d	\n\t	fmul	v17.2d,v6.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"\
		"ldp	q12,q13,[x2,#0x200]	\n\t"\
		"fmls	v2.2d,v16.2d,v28.2d	\n\t	fmls	v6.2d,v17.2d,v30.2d	\n\t"\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v2.2d,v2.2d,v12.2d	\n\t	fmul	v6.2d,v6.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x200]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x220]	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"add	 v8.4s , v8.4s ,v10.4s	\n\t"/* bjmod update (+= bw, mod n), as in A.re block */\
		"cmge	v18.4s , v8.4s ,v11.4s	\n\t"\
		"and	v19.16b,v18.16b,v11.16b	\n\t"\
		"sub	 v8.4s , v8.4s ,v19.4s	\n\t"\
	/*******************************/\
	/* Do B.im pair: Data in v3,7: */\
	/*******************************/\
	"ldr	w6,%[__p3]	\n\t	prfm	PLDL1KEEP,[x5,x6,LSL #3]	\n\t"\
		"cmge	v31.4s,v9.4s,v8.4s	\n\t"\
		"zip1	v29.4s,v31.4s,v31.4s\n\t	zip2	v31.4s,v31.4s,v31.4s\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"ldp	q18,q19,[x2,#0x260]	\n\t"\
		"fmul	v0.2d,v3.2d,v18.2d	\n\t	fmul	v4.2d,v7.2d,v19.2d	\n\t"\
		"mov	v3.16b,v16.16b		\n\t	mov	v7.16b,v17.16b			\n\t"/* copy carries into v3,7 in prep for FMA */\
		"bsl v29.16b,v23.16b,v22.16b\n\t	bsl v31.16b,v23.16b,v22.16b	\n\t"\
		"frintn	v12.2d,v0.2d		\n\t	frintn	v13.2d,v4.2d		\n\t"/* DNINT(x) */\
		"fmla	v3.2d,v12.2d,v21.2d	\n\t	fmla	v7.2d,v13.2d,v21.2d	\n\t"/* temp = DNINT(x)*prp_mult + cy */\
		"fsub	v12.2d,v0.2d,v12.2d	\n\t	fsub	v13.2d,v4.2d,v13.2d	\n\t"/* x - DNINT(x) */\
		"fmul	v16.2d,v3.2d,v29.2d	\n\t	fmul	v17.2d,v7.2d,v31.2d	\n\t"/* temp*baseinv */\
		"frintn	v16.2d,v16.2d		\n\t	frintn	v17.2d,v17.2d		\n\t"/* cy = DNINT(temp*baseinv[i]): */\
		"fadd	v29.2d,v20.2d,v20.2d\n\t"/* base[0]x2 */\
		"fabs	v12.2d,v12.2d		\n\t	fabs	v13.2d,v13.2d		\n\t"\
		"fmax	v14.2d,v14.2d,v12.2d\n\t	fmax	v15.2d,v15.2d,v13.2d\n\t"\
		"bsl v28.16b,v29.16b,v20.16b\n\t	bsl	v30.16b,v29.16b,v20.16b	\n\t"\
		"ldp	q12,q13,[x2,#0x240]	\n\t"\
		"fmls	v3.2d,v16.2d,v28.2d	\n\t	fmls	v7.2d,v17.2d,v30.2d	\n\t"\
		/* Update and store weights: */\
		"cmge	v29.2d,v12.2d,v27.2d\n\t	cmge	v31.2d,v13.2d,v27.2d\n\t"\
		"not	v28.16b,v29.16b		\n\t	not		v30.16b,v31.16b		\n\t"\
		"fmul	v3.2d,v3.2d,v12.2d	\n\t	fmul	v7.2d,v7.2d,v13.2d	\n\t"\
		"bsl v28.16b,v25.16b,v24.16b\n\t	bsl	v30.16b,v25.16b,v24.16b	\n\t"\
		"fmul	v12.2d,v12.2d,v28.2d\n\t	fmul	v13.2d,v13.2d,v30.2d\n\t"\
		"stp	q12,q13,[x2,#0x240]	\n\t"\
		"bsl v29.16b,v27.16b,v26.16b\n\t	bsl	v31.16b,v27.16b,v26.16b	\n\t"\
		"fmul	v18.2d,v18.2d,v29.2d\n\t	fmul	v19.2d,v19.2d,v31.2d\n\t"\
		"stp	q18,q19,[x2,#0x260]	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"add	 v8.4s , v8.4s ,v10.4s	\n\t"/* final bjmod update (+= bw, mod n) before store-back */\
		"cmge	v18.4s , v8.4s ,v11.4s	\n\t"\
		"and	v19.16b,v18.16b,v11.16b	\n\t"\
		"sub	 v8.4s , v8.4s ,v19.4s	\n\t"\
		"str	q8,[x1]					\n\t"/* write bjmod[0:3] */\
		"str q16,[x3] \n\t str q17,[x4] \n\t"/* write cyA,B */\
		"fmax	v14.2d,v14.2d,v15.2d	\n\t"/* maxerr between the 2 cols */\
		"str	q14,[x2,#-0x20]			\n\t"/* write maxerr */\
		/**********************************************/\
		/*              Repack the data:              */\
		/**********************************************/\
		"ldp	q0,q4,[x0] 			\n\t"/* Restore earlier-spilled data pair */\
		"trn2	v8.2d,v0.2d,v2.2d	\n\t	trn2	v10.2d,v4.2d,v6.2d	\n\t"/* Re-interleave via 2x2 transposes, inverse of opening unpack */\
		"trn2	v9.2d,v1.2d,v3.2d	\n\t	trn2	v11.2d,v5.2d,v7.2d	\n\t"\
		"trn1	v0.2d,v0.2d,v2.2d	\n\t	trn1	v4.2d ,v4.2d,v6.2d	\n\t"\
		"trn1	v1.2d,v1.2d,v3.2d	\n\t	trn1	v5.2d ,v5.2d,v7.2d	\n\t"\
		"stp	q0,q1,[x0      ]	\n\t	stp	q4 ,q5 ,[x0,#0x40]	\n\t"\
		"stp	q8,q9,[x0,#0x20]	\n\t	stp	q10,q11,[x0,#0x60]	\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__i]			"m" (Xi)			\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15",\
					"v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// Don't support HIACC version of above Mersenne-mod macros nor Fermat-mod in ARMv8 builds,
	// so just include skeleton versions of the associated 128-bit SIMD carry macros:
	// Skeleton pow2 variant: forwards all arguments unchanged to the generic macro below.
	// NOTE(review): the caller's Xsse_nm1 value lands in the generic macro's Xsse_n parameter
	// slot - harmless while these remain skeletons, but confirm if the bodies are ever filled in.
	#define SSE2_cmplx_carry_norm_pow2_errcheck1_2B(Xdata,XwtA,XwtB,XwtC,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1, Xprp_mult)\
	{\
		SSE2_cmplx_carry_norm_errcheck1_2B(Xdata,XwtA,XwtB,XwtC,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1, Xprp_mult)\
	}

	/* Skeleton (unimplemented) HIACC Mersenne-mod carry macro for ARMv8 builds: the asm body only
	loads the data pointer, so callers compile but no carry work is done (see comment above re.
	HIACC not being supported in ARMv8 builds). All operands are kept so the interface matches the
	x86 versions of this macro.
	Fix: the LDR below writes x0, so x0 must appear in the clobber list - previously only
	"cc","memory" were declared, which lets the compiler assume x0 survives the asm block and can
	silently corrupt a live value at the call site. */\
	#define SSE2_cmplx_carry_norm_errcheck1_2B(Xdata,XwtA,XwtB,XwtC,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__data]		\n\t"/* Placeholder: touch the data pointer only */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0"	/* Clobbered registers: x0 is written by the LDR above */\
	);\
	}

	// Skeleton pow2 variant: forwards all arguments unchanged to the generic macro below.
	// NOTE(review): the caller's Xsse_nm1 value lands in the generic macro's Xsse_n parameter
	// slot - harmless while these remain skeletons, but confirm if the bodies are ever filled in.
	#define SSE2_cmplx_carry_norm_pow2_errcheck2_2B(Xdata,XwtA,XwtB,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp2,Xp3, Xprp_mult)\
	{\
		SSE2_cmplx_carry_norm_errcheck2_2B(Xdata,XwtA,XwtB,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp2,Xp3, Xprp_mult)\
	}

	/* Skeleton (unimplemented) HIACC pass-2 carry macro for ARMv8 builds: the asm body only loads
	the data pointer; operands are kept so the interface matches the x86 versions.
	Fix: the LDR below writes x0, so x0 must appear in the clobber list - previously only
	"cc","memory" were declared, letting the compiler assume x0 is preserved across the asm. */\
	#define SSE2_cmplx_carry_norm_errcheck2_2B(Xdata,XwtA,XwtB,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__data]		\n\t"/* Placeholder: touch the data pointer only */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 2 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0"	/* Clobbered registers: x0 is written by the LDR above */\
	);\
	}

	/*************************************************************/
	/**************** FERMAT  -MOD CARRY MACROS ******************/
	/*************************************************************/

	/* Skeleton (unimplemented) pow2-runlength Fermat-mod carry macro for ARMv8 builds: the asm
	body only loads the data pointer; operands are kept so the interface matches the x86 versions
	(Fermat-mod is not supported in ARMv8 builds - see comment preceding this macro group).
	Fix: the LDR below writes x0, so x0 must appear in the clobber list - previously only
	"cc","memory" were declared, letting the compiler assume x0 is preserved across the asm. */\
	#define SSE2_fermat_carry_norm_pow2_errcheck_X2(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xhalf_arr,Xsign_mask,Xadd1,Xadd2, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__data]		\n\t"/* Placeholder: touch the data pointer only */\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0"	/* Clobbered registers: x0 is written by the LDR above */\
	);\
	}

	/* Skeleton (unimplemented) non-pow2 (odd-radix) Fermat-mod carry macro for ARMv8 builds: the
	asm body only loads the data pointer; operands (incl. the i/jcycle index quartets) are kept so
	the interface matches the x86 versions.
	Fix: the LDR below writes x0, so x0 must appear in the clobber list - previously only
	"cc","memory" were declared, letting the compiler assume x0 is preserved across the asm. */\
	#define SSE2_fermat_carry_norm_errcheck_X2(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xodd_radix,Xhalf_arr,Xsign_mask,Xadd1,Xadd2,Xicycle0,Xjcycle0,Xicycle1,Xjcycle1, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__data]		\n\t"/* Placeholder: touch the data pointer only */\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__odd_radix]	"m" (Xodd_radix)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		,	[__icycle0]		"m" (Xicycle0)\
		,	[__jcycle0]		"m" (Xjcycle0)\
		,	[__icycle1]		"m" (Xicycle1)\
		,	[__jcycle1]		"m" (Xjcycle1)\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","x0"	/* Clobbered registers: x0 is written by the LDR above */\
	);\
	}

#elif defined(USE_AVX)

	/*
	Use MOVUPD (or 1-byte-shorter MOVUPS) in legacy 128-bit SSE form to load 2 doubles into lo128 without touching hi128;
	Thus can do MOVUPD m128,xmm1 to fill lo128 of ymm1, then fill hi128 from xmm2/m128 via (with imm8 = 1):

		VINSERTF128 imm8,src2[xmm/m128],src1[ymm1],dest[ymm2]:
		imm8 = 0: dest.lo128 = src2, dest.hi128 = src1.hi128
		imm8 = 1: dest.lo128 = src1.lo128, dest.hi128 = src2

	Once have 4 dcomplex roots loaded into 2 ymm as
	ymm0.lo,hi = [c0,s0,c2,s2]
	ymm1.lo,hi = [c1,s1,c3,s3] ,

	we interleave via

	vunpcklpd ymm0,ymm1,ymmA
	vunpckhpd ymm0,ymm1,ymmB

	to get

	ymmA = [c0,c1,c2,c3]
	ymmB = [s0,s1,s2,s3]

	Similarly for table2 [ = rn1 ] roots to get:

	ymmC = [x0,x1,x2,x3]
	ymmD = [y0,y1,y2,y3]

	then do CMUL:

	vmulpd	ymmA,ymmD,ymmE	// ymmE = c.y
	vmulpd	ymmA,ymmC,ymmA	// ymmA = c.x

	vmulpd	ymmB,ymmC,ymmC	// ymmC = s.x
	vmulpd	ymmB,ymmD,ymmD	// ymmD = s.y

	vsubpd	ymmA,ymmD,ymmA	// ymmA = c.x - s.y; ymmD free
	vsubpd	ymmC,ymmE,ymmB	// ymmB = s.x + c.y; ymmC,E free
	*/

  #ifdef USE_AVX512

	/* Power-of-2-runlength 8-way Fermat-mod acyclic-transform/IBDWT carry macro
	(based on AVX2 version of SSE2_fermat_carry_norm_pow2_errcheck_X4):
	NB: Inanely, VANDPS|PD are not supported in AVX-512F (support starts in AVX-512DQ), so replace with VPANDQ and hope no cycle penalty for type-mixing:
	*/
	#define SSE2_fermat_carry_norm_pow2_errcheck_X8(Xdata,Xbase_root,Xcmul_offset,Xcy_re,Xcy_im,Xhalf_arr,Xsign_mask, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%zmm24	\n\t"/* prp_mult, broadcast to all double-slots of zmm24 */\
		"movq		%[__add0],%%r14		\n\t"/* base address for 8 prefetches-from-main-data-array spread through this macro */\
		/* Base negacyclic roots at this address in [0,2,4,6,8,a,c,e]*0x40 (Re parts), [1,3,5,7,9,b,d,f]*0x40 (Imag parts) */\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movslq	%[__cmul_offset],%%rbx		\n\t"\
		"addq	%%rax,%%rbx	\n\t"/* Index into complex const multipliers block, each applied to 8 sets of base roots */\
		/* Up-multiply octet of negacyclic roots used in this macro invocation, and store results back into local-mem: */\
		"vmovaps	     (%%rbx),%%zmm10	\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-7 */\
		"vmovaps	0x040(%%rbx),%%zmm11	\n\t	"/* c = Re(exp) in zmm0, s = Im(exp) in zmm1 */\
	/* Sets 1/2: */\
		"vmovaps	      (%%rax),%%zmm0	\n\t	vmovaps	 0x080(%%rax),%%zmm2		\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x040(%%rax),%%zmm1	\n\t	vmovaps	 0x0c0(%%rax),%%zmm3		\n\t"/* y = Im part */\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"/* Copy x */\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"/* Copy y */\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"/* c.x */\
		"vmulpd		%%zmm11,%%zmm5,%%zmm5	\n\t	vmulpd		%%zmm11,%%zmm7,%%zmm7	\n\t"/* s.y */\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"/* c.y */\
		"vmulpd		%%zmm11,%%zmm4,%%zmm4	\n\t	vmulpd		%%zmm11,%%zmm6,%%zmm6	\n\t"/* s.x */\
		"vsubpd		%%zmm5 ,%%zmm0,%%zmm0	\n\t	vsubpd		%%zmm7 ,%%zmm2,%%zmm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%zmm4 ,%%zmm1,%%zmm1	\n\t	vaddpd		%%zmm6 ,%%zmm3,%%zmm3	\n\t"/* Out.im = c.y + s.x */\
		"vmovaps	%%zmm0 ,0x000(%%rax)	\n\t	vmovaps		%%zmm2 ,0x080(%%rax)	\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1 ,0x040(%%rax)	\n\t	vmovaps		%%zmm3 ,0x0c0(%%rax)	\n\t"/* Im part */\
	/* Sets 3/4: */\
		"vmovaps	 0x100(%%rax),%%zmm0	\n\t	vmovaps	 0x180(%%rax),%%zmm2		\n\t"\
		"vmovaps	 0x140(%%rax),%%zmm1	\n\t	vmovaps	 0x1c0(%%rax),%%zmm3		\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm11,%%zmm5,%%zmm5	\n\t	vmulpd		%%zmm11,%%zmm7,%%zmm7	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
		"vmulpd		%%zmm11,%%zmm4,%%zmm4	\n\t	vmulpd		%%zmm11,%%zmm6,%%zmm6	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm0,%%zmm0	\n\t	vsubpd		%%zmm7 ,%%zmm2,%%zmm2	\n\t"\
		"vaddpd		%%zmm4 ,%%zmm1,%%zmm1	\n\t	vaddpd		%%zmm6 ,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0 ,0x100(%%rax)	\n\t	vmovaps		%%zmm2 ,0x180(%%rax)	\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1 ,0x140(%%rax)	\n\t	vmovaps		%%zmm3 ,0x1c0(%%rax)	\n\t"/* Im part */\
	/* Sets 5/6: */\
		"vmovaps	 0x200(%%rax),%%zmm0	\n\t	vmovaps	 0x280(%%rax),%%zmm2		\n\t"\
		"vmovaps	 0x240(%%rax),%%zmm1	\n\t	vmovaps	 0x2c0(%%rax),%%zmm3		\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm11,%%zmm5,%%zmm5	\n\t	vmulpd		%%zmm11,%%zmm7,%%zmm7	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
		"vmulpd		%%zmm11,%%zmm4,%%zmm4	\n\t	vmulpd		%%zmm11,%%zmm6,%%zmm6	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm0,%%zmm0	\n\t	vsubpd		%%zmm7 ,%%zmm2,%%zmm2	\n\t"\
		"vaddpd		%%zmm4 ,%%zmm1,%%zmm1	\n\t	vaddpd		%%zmm6 ,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0 ,0x200(%%rax)	\n\t	vmovaps		%%zmm2 ,0x280(%%rax)	\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1 ,0x240(%%rax)	\n\t	vmovaps		%%zmm3 ,0x2c0(%%rax)	\n\t"/* Im part */\
	/* Sets 7/8: */\
		"vmovaps	 0x300(%%rax),%%zmm0	\n\t	vmovaps	 0x380(%%rax),%%zmm2		\n\t"\
		"vmovaps	 0x340(%%rax),%%zmm1	\n\t	vmovaps	 0x3c0(%%rax),%%zmm3		\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm11,%%zmm5,%%zmm5	\n\t	vmulpd		%%zmm11,%%zmm7,%%zmm7	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
		"vmulpd		%%zmm11,%%zmm4,%%zmm4	\n\t	vmulpd		%%zmm11,%%zmm6,%%zmm6	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm0,%%zmm0	\n\t	vsubpd		%%zmm7 ,%%zmm2,%%zmm2	\n\t"\
		"vaddpd		%%zmm4 ,%%zmm1,%%zmm1	\n\t	vaddpd		%%zmm6 ,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0 ,0x300(%%rax)	\n\t	vmovaps		%%zmm2 ,0x380(%%rax)	\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1 ,0x340(%%rax)	\n\t	vmovaps		%%zmm3 ,0x3c0(%%rax)	\n\t"/* Im part */\
	/* Apply inverse-complex-runlength scaling factor to the data as we read them in below in preparation for the 8x8 transpose: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	0x080(%%rdx),%%zmm18	\n\t"/* zmm18 = scale, discarded after MULs below */\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0/1,2/3,4/5,6/7,8/9,a/b,c/d,e/f. Outputs into zmm0/1,2/3,4/5,6/7,8/9,a/b,c/d,e/f: */\
		"movq		%[__data],%%rax		\n\t"\
	/* Real parts use zmm0,2,4,6,8,10,12,14,16:				Imag parts use zmm1,3,5,7,9,11,13,15,17: */\
		/* Read in the 8 rows of our input matrix ... the funky index ordering gives ordered outputs sans reg-copying: */\
		"vmulpd	0x000(%%rax),%%zmm18,%%zmm6 		\n\t	vmulpd	0x040(%%rax),%%zmm18,%%zmm7 		\n\t"\
		"vmulpd	0x080(%%rax),%%zmm18,%%zmm8 		\n\t	vmulpd	0x0c0(%%rax),%%zmm18,%%zmm9 		\n\t"\
		"vmulpd	0x100(%%rax),%%zmm18,%%zmm10		\n\t	vmulpd	0x140(%%rax),%%zmm18,%%zmm11		\n\t"\
		"vmulpd	0x180(%%rax),%%zmm18,%%zmm16		\n\t	vmulpd	0x1c0(%%rax),%%zmm18,%%zmm17		\n\t"\
		"vmulpd	0x200(%%rax),%%zmm18,%%zmm12		\n\t	vmulpd	0x240(%%rax),%%zmm18,%%zmm13		\n\t"\
		"vmulpd	0x280(%%rax),%%zmm18,%%zmm0 		\n\t	vmulpd	0x2c0(%%rax),%%zmm18,%%zmm1 		\n\t"\
		"vmulpd	0x300(%%rax),%%zmm18,%%zmm2 		\n\t	vmulpd	0x340(%%rax),%%zmm18,%%zmm3 		\n\t"\
		"vmulpd	0x380(%%rax),%%zmm18,%%zmm14		\n\t	vmulpd	0x3c0(%%rax),%%zmm18,%%zmm15		\n\t"\
		"\n\t"\
		"vunpcklpd		%%zmm8 ,%%zmm6 ,%%zmm4 		\n\t	vunpcklpd		%%zmm9 ,%%zmm7 ,%%zmm5 		\n\t"\
		"vunpckhpd		%%zmm8 ,%%zmm6 ,%%zmm8 		\n\t	vunpckhpd		%%zmm9 ,%%zmm7 ,%%zmm9 		\n\t"\
		"vunpcklpd		%%zmm16,%%zmm10,%%zmm6 		\n\t	vunpcklpd		%%zmm17,%%zmm11,%%zmm7 		\n\t"\
		"vunpckhpd		%%zmm16,%%zmm10,%%zmm16		\n\t	vunpckhpd		%%zmm17,%%zmm11,%%zmm17		\n\t"\
		"vunpcklpd		%%zmm0 ,%%zmm12,%%zmm10		\n\t	vunpcklpd		%%zmm1 ,%%zmm13,%%zmm11		\n\t"\
		"vunpckhpd		%%zmm0 ,%%zmm12,%%zmm0 		\n\t	vunpckhpd		%%zmm1 ,%%zmm13,%%zmm1 		\n\t"\
		"vunpcklpd		%%zmm14,%%zmm2 ,%%zmm12		\n\t	vunpcklpd		%%zmm15,%%zmm3 ,%%zmm13		\n\t"\
		"vunpckhpd		%%zmm14,%%zmm2 ,%%zmm14		\n\t	vunpckhpd		%%zmm15,%%zmm3 ,%%zmm15		\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm6 ,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm7 ,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm6 ,%%zmm4 ,%%zmm6 	\n\t	vshuff64x2	$221,%%zmm7 ,%%zmm5 ,%%zmm7 	\n\t"\
		"vshuff64x2	$136,%%zmm16,%%zmm8 ,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm17,%%zmm9 ,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm16,%%zmm8 ,%%zmm16	\n\t	vshuff64x2	$221,%%zmm17,%%zmm9 ,%%zmm17	\n\t"\
		"vshuff64x2	$136,%%zmm12,%%zmm10,%%zmm8 	\n\t	vshuff64x2	$136,%%zmm13,%%zmm11,%%zmm9 	\n\t"\
		"vshuff64x2	$221,%%zmm12,%%zmm10,%%zmm12	\n\t	vshuff64x2	$221,%%zmm13,%%zmm11,%%zmm13	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm0 ,%%zmm10	\n\t	vshuff64x2	$136,%%zmm15,%%zmm1 ,%%zmm11	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm0 ,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm1 ,%%zmm15	\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm2 ,%%zmm0 	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm3 ,%%zmm1 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm2 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm3 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm10,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm11,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm10,%%zmm4 ,%%zmm10	\n\t	vshuff64x2	$221,%%zmm11,%%zmm5 ,%%zmm11	\n\t"\
		"vshuff64x2	$136,%%zmm12,%%zmm6 ,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm13,%%zmm7 ,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm12,%%zmm6 ,%%zmm12	\n\t	vshuff64x2	$221,%%zmm13,%%zmm7 ,%%zmm13	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm16,%%zmm6 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm17,%%zmm7 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm16,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm17,%%zmm15	\n\t"\
		/* Outputs are now ordered - leave in registers. */\
		/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movq	%[__base_root] ,%%rax		\n\t"/* Base negacyclic roots at this address in [0,2,...,14]*0x040 (Re parts), [1,3,...,15]*0x040 (Imag parts) */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	      (%%rdx),%%zmm20	\n\t"/* [base0-7] */\
		"vmovaps	 0x040(%%rdx),%%zmm21	\n\t"/* [baseinv0-7] */\
		"vmovaps	-0x080(%%rdx),%%zmm22	\n\t"/* zmm21 = maxerr */\
		"vmovaps	%%zmm22,%%zmm23			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
		"movq		%[__cy_im],%%rcx		\n\t"\
	/* Do a-quartet: Data in zmm0 ,zmm1 : */\
	"prefetcht0	(%%r14)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	     (%%rax),%%zmm18	\n\t"/* c = Re part of 1st base-root quartet */\
		"vmovaps	0x040(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm0 ,zmm1; Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm19,%%zmm0 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm1 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm0 	\n\t"/* [a0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm1 	\n\t"/* [a0-7.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%zmm0 ,%%zmm16			\n\t	vmovaps		%%zmm1 ,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm0 ,%%zmm0 	\n\t	vrndscalepd	$0,%%zmm1 ,%%zmm1 		\n\t"/* temp = DNINT(x|y) */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"vsubpd		%%zmm0 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm1 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm0	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm1	\n\t"/* temp = temp*prp_mult + cy [register args as they appear in left-to-right order: cy,prp_mult,temp] */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm0 ,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm1 ,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm0 	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm1 	\n\t"/* xmm0|1 = [a0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm0 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm1 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm0 	\n\t"/* [a0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm1 	\n\t"/* [a0-7.im] = y*wt_re + x*wt_im */\
	/* Do b-quartet: Data in zmm2 ,zmm3 : */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x080(%%rax),%%zmm18	\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x0c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm2 ,zmm3 */\
		"vmulpd		%%zmm19,%%zmm2 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm3 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm2 	\n\t"/* [b0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm3 	\n\t"/* [b0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm2 ,%%zmm16			\n\t	vmovaps		%%zmm3 ,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm2 ,%%zmm2 	\n\t	vrndscalepd	$0,%%zmm3 ,%%zmm3 		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm2 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm3 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm2	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm2 ,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm3 ,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm2 	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm3 	\n\t"/* xmm0|1 = [b0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm2 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm3 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm2 	\n\t"/* [b0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm3 	\n\t"/* [b0-7.im] = y*wt_re + x*wt_im */\
	/* Do c-quartet: Data in zmm4 ,zmm5 : */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x100(%%rax),%%zmm18	\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0x140(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm4 ,zmm5 */\
		"vmulpd		%%zmm19,%%zmm4 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm5 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm4 	\n\t"/* [c0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm5 	\n\t"/* [c0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm4 ,%%zmm16			\n\t	vmovaps		%%zmm5 ,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm4 ,%%zmm4 	\n\t	vrndscalepd	$0,%%zmm5 ,%%zmm5 		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm4 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm5 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm4	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm4 ,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm5 ,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm4 	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm5 	\n\t"/* xmm0|1 = [c0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm4 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm5 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm4 	\n\t"/* [c0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm5 	\n\t"/* [c0-7.im] = y*wt_re + x*wt_im */\
	/* Do d-quartet: Data in zmm6 ,zmm7 : */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x180(%%rax),%%zmm18	\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0x1c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm6 ,zmm7 */\
		"vmulpd		%%zmm19,%%zmm6 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm7 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm6 	\n\t"/* [d0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm7 	\n\t"/* [d0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm6 ,%%zmm16			\n\t	vmovaps		%%zmm7 ,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm6 ,%%zmm6 	\n\t	vrndscalepd	$0,%%zmm7 ,%%zmm7 		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm6 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm7 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm6	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm6 ,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm7 ,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm6 	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm7 	\n\t"/* xmm0|1 = [d0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm6 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm7 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm6 	\n\t"/* [d0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm7 	\n\t"/* [d0-7.im] = y*wt_re + x*wt_im */\
	/* Do e-quartet: Data in zmm8 ,zmm9 : */\
	"movslq		%[__p4],%%r15	\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x200(%%rax),%%zmm18	\n\t"/* c = Re part of 5th base-root quartet */\
		"vmovaps	0x240(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm8 ,zmm9 */\
		"vmulpd		%%zmm19,%%zmm8 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm9 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm8 	\n\t"/* [e0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm9 	\n\t"/* [e0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm8 ,%%zmm16			\n\t	vmovaps		%%zmm9 ,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm8 ,%%zmm8 	\n\t	vrndscalepd	$0,%%zmm9 ,%%zmm9 		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm8 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm9 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm8	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm8 ,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm9 ,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm8 	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm9 	\n\t"/* xmm0|1 = [e0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm8 ,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm9 ,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm8 	\n\t"/* [e0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm9 	\n\t"/* [e0-7.im] = y*wt_re + x*wt_im */\
	/* Do f-quartet: Data in zmm10,zmm11: */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x280(%%rax),%%zmm18	\n\t"/* c = Re part of 6th base-root quartet */\
		"vmovaps	0x2c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm10,zmm11*/\
		"vmulpd		%%zmm19,%%zmm10,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm11,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm10	\n\t"/* [f0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm11	\n\t"/* [f0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm10,%%zmm16			\n\t	vmovaps		%%zmm11,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm10,%%zmm10	\n\t	vrndscalepd	$0,%%zmm11,%%zmm11		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm10,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm11,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm10	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm10,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm11,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm10	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm11	\n\t"/* xmm0|1 = [f0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm10,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm11,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm10	\n\t"/* [f0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm11	\n\t"/* [f0-7.im] = y*wt_re + x*wt_im */\
	/* Do g-quartet: Data in zmm12,zmm13: */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x300(%%rax),%%zmm18	\n\t"/* c = Re part of 7th base-root quartet */\
		"vmovaps	0x340(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm12,zmm13*/\
		"vmulpd		%%zmm19,%%zmm12,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm13,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm12	\n\t"/* [g0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm13	\n\t"/* [g0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm12,%%zmm16			\n\t	vmovaps		%%zmm13,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm12,%%zmm12	\n\t	vrndscalepd	$0,%%zmm13,%%zmm13		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm12,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm13,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm12	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm13	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm12,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm13,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm12	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm13	\n\t"/* xmm0|1 = [g0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm12,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm13,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm12	\n\t"/* [g0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm13	\n\t"/* [g0-7.im] = y*wt_re + x*wt_im */\
	/* Do h-quartet: Data in zmm14,zmm15: */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%zmm23,%%zmm22			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x380(%%rax),%%zmm18	\n\t"/* c = Re part of 8th base-root quartet */\
		"vmovaps	0x3c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		/* Data in zmm14,zmm15*/\
		"vmulpd		%%zmm19,%%zmm14,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm15,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmadd213pd	%%zmm17,%%zmm18,%%zmm14	\n\t"/* [h0-7.re] = x*wt_re + y*wt_im */\
	"vfmsub213pd	%%zmm16,%%zmm18,%%zmm15	\n\t"/* [h0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm14,%%zmm16			\n\t	vmovaps		%%zmm15,%%zmm17			\n\t"/* copy x|y */\
		"vrndscalepd	$0,%%zmm14,%%zmm14	\n\t	vrndscalepd	$0,%%zmm15,%%zmm15		\n\t"/* temp = DNINT(x|y) */\
		"vsubpd		%%zmm14,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm15,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
	"vfmadd213pd	(%%rbx),%%zmm24,%%zmm14	\n\t vfmadd213pd	(%%rcx),%%zmm24,%%zmm15	\n\t"/* temp = temp*prp_mult + cy */\
		"vpandq		(%%rsi),%%zmm16,%%zmm16	\n\t	vpandq		(%%rsi),%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
		"vmaxpd		%%zmm22,%%zmm16,%%zmm22	\n\t	vmaxpd		%%zmm23,%%zmm17,%%zmm23	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmulpd		%%zmm21,%%zmm14,%%zmm16	\n\t	vmulpd		%%zmm21,%%zmm15,%%zmm17	\n\t"/* temp*baseinv[0] */\
		"vmaxpd		%%zmm22,%%zmm23,%%zmm23	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd	$0,%%zmm16,%%zmm16	\n\t	vrndscalepd	$0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,(%%rcx)			\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm20,%%zmm16,%%zmm14	\n\t vfnmadd231pd	%%zmm20,%%zmm17,%%zmm15	\n\t"/* xmm0|1 = [h0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm19,%%zmm14,%%zmm16	\n\t"/* wt_im*[x copy] */\
		"vmulpd		%%zmm19,%%zmm15,%%zmm17	\n\t"/* wt_im*[y copy] */\
	"vfmsub213pd	%%zmm17,%%zmm18,%%zmm14	\n\t"/* [h0-7.re] = x*wt_re - y*wt_im */\
	"vfmadd213pd	%%zmm16,%%zmm18,%%zmm15	\n\t"/* [h0-7.im] = y*wt_re + x*wt_im */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm23,-0x080(%%rdx)	\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately), now using a reg-copy-less algo with ordered *inputs*: */\
		"movq		%[__data],%%rax			\n\t"\
		"vunpcklpd		%%zmm2 ,%%zmm0 ,%%zmm16		\n\t	vunpcklpd		%%zmm3 ,%%zmm1 ,%%zmm17		\n\t"\
		"vunpckhpd		%%zmm2 ,%%zmm0 ,%%zmm2 		\n\t	vunpckhpd		%%zmm3 ,%%zmm1 ,%%zmm3 		\n\t"\
		"vunpcklpd		%%zmm6 ,%%zmm4 ,%%zmm0 		\n\t	vunpcklpd		%%zmm7 ,%%zmm5 ,%%zmm1 		\n\t"\
		"vunpckhpd		%%zmm6 ,%%zmm4 ,%%zmm6 		\n\t	vunpckhpd		%%zmm7 ,%%zmm5 ,%%zmm7 		\n\t"\
		"vunpcklpd		%%zmm10,%%zmm8 ,%%zmm4 		\n\t	vunpcklpd		%%zmm11,%%zmm9 ,%%zmm5 		\n\t"\
		"vunpckhpd		%%zmm10,%%zmm8 ,%%zmm10		\n\t	vunpckhpd		%%zmm11,%%zmm9 ,%%zmm11		\n\t"\
		"vunpcklpd		%%zmm14,%%zmm12,%%zmm8 		\n\t	vunpcklpd		%%zmm15,%%zmm13,%%zmm9 		\n\t"\
		"vunpckhpd		%%zmm14,%%zmm12,%%zmm14		\n\t	vunpckhpd		%%zmm15,%%zmm13,%%zmm15		\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm0 ,%%zmm16,%%zmm12	\n\t	vshuff64x2	$136,%%zmm1 ,%%zmm17,%%zmm13	\n\t"\
		"vshuff64x2	$221,%%zmm0 ,%%zmm16,%%zmm0 	\n\t	vshuff64x2	$221,%%zmm1 ,%%zmm17,%%zmm1 	\n\t"\
		"vshuff64x2	$136,%%zmm6 ,%%zmm2 ,%%zmm16	\n\t	vshuff64x2	$136,%%zmm7 ,%%zmm3 ,%%zmm17 	\n\t"\
		"vshuff64x2	$221,%%zmm6 ,%%zmm2 ,%%zmm6 	\n\t	vshuff64x2	$221,%%zmm7 ,%%zmm3 ,%%zmm7 	\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm4 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm5 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm10,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm11,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm10,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm11,%%zmm15	\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm2 ,%%zmm12,%%zmm10	\n\t	vshuff64x2	$136,%%zmm3 ,%%zmm13,%%zmm11	\n\t"\
		"vshuff64x2	$221,%%zmm2 ,%%zmm12,%%zmm2 	\n\t	vshuff64x2	$221,%%zmm3 ,%%zmm13,%%zmm3 	\n\t"\
		"vshuff64x2	$136,%%zmm4 ,%%zmm16,%%zmm12	\n\t	vshuff64x2	$136,%%zmm5 ,%%zmm17,%%zmm13	\n\t"\
		"vshuff64x2	$221,%%zmm4 ,%%zmm16,%%zmm4 	\n\t	vshuff64x2	$221,%%zmm5 ,%%zmm17,%%zmm5 	\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm0 ,%%zmm16	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm1 ,%%zmm17 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm0 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm1 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm6 ,%%zmm0 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm7 ,%%zmm1 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm6 ,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm7 ,%%zmm15	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm11,0x040(%%rax)	\n\t"\
		"vmovaps	%%zmm12,0x080(%%rax)				\n\t	vmovaps		%%zmm13,0x0c0(%%rax)	\n\t"\
		"vmovaps	%%zmm16,0x100(%%rax)				\n\t	vmovaps		%%zmm17,0x140(%%rax)	\n\t"\
		"vmovaps	%%zmm0 ,0x180(%%rax)				\n\t	vmovaps		%%zmm1 ,0x1c0(%%rax)	\n\t"\
		"vmovaps	%%zmm2 ,0x200(%%rax)				\n\t	vmovaps		%%zmm3 ,0x240(%%rax)	\n\t"\
		"vmovaps	%%zmm4 ,0x280(%%rax)				\n\t	vmovaps		%%zmm5 ,0x2c0(%%rax)	\n\t"\
		"vmovaps	%%zmm8 ,0x300(%%rax)				\n\t	vmovaps		%%zmm9 ,0x340(%%rax)	\n\t"\
		"vmovaps	%%zmm14,0x380(%%rax)				\n\t	vmovaps		%%zmm15,0x3c0(%%rax)	\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cmul_offset] "m" (Xcmul_offset)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"m" (Xcy_im)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Prefetch: base address and index offsets p0-7 = p[0,1,2,3,4+0,4+1,4+2,4+3]: */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		,	[__p4] "m" (Xp4)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15", "xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23","xmm24"	/* Clobbered registers */\
	);\
	}

	#define SSE2_fermat_carry_init_loacc(Xbase_root)\
	{\
	__asm__ volatile (\
		"movq		%[__base_root] ,%%rax	\n\t	"/* Base negacyclic roots at this address +16*0x40 (Re parts), +17*0x40 (Im parts) */\
		"vmovaps	0x400(%%rax),%%zmm10	\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-7 */\
		"vmovaps	0x440(%%rax),%%zmm11	\n\t	"/* c = Re(exp) in zmm0, s = Im(exp) in zmm1 */\
	/* base-root octets 0,1: */\
		"vmovaps	     (%%rax),%%zmm0		\n\t	vmovaps	0x080(%%rax),%%zmm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	0x040(%%rax),%%zmm1		\n\t	vmovaps	0x0c0(%%rax),%%zmm3			\n\t"/* y = Im part */\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"/* Copy x */\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"/* Copy y */\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"/* c.x */\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"/* c.y */\
	"vfnmadd231pd	%%zmm11,%%zmm5,%%zmm0 	\n\t vfnmadd231pd	%%zmm11,%%zmm7,%%zmm2	\n\t"/* Out.re = c.x - s.y */\
	" vfmadd231pd	%%zmm11,%%zmm4,%%zmm1 	\n\t  vfmadd231pd	%%zmm11,%%zmm6,%%zmm3	\n\t"/* Out.im = c.y + s.x */\
		"vmovaps	%%zmm0,     (%%rax)		\n\t	vmovaps		%%zmm2,0x080(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1,0x040(%%rax)		\n\t	vmovaps		%%zmm3,0x0c0(%%rax)		\n\t"/* Im part */\
	/* base-root octets 2,3: */\
		"vmovaps	0x100(%%rax),%%zmm0		\n\t	vmovaps	0x180(%%rax),%%zmm2			\n\t"\
		"vmovaps	0x140(%%rax),%%zmm1		\n\t	vmovaps	0x1c0(%%rax),%%zmm3			\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
	"vfnmadd231pd	%%zmm11,%%zmm5,%%zmm0 	\n\t vfnmadd231pd	%%zmm11,%%zmm7,%%zmm2	\n\t"\
	" vfmadd231pd	%%zmm11,%%zmm4,%%zmm1 	\n\t  vfmadd231pd	%%zmm11,%%zmm6,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x100(%%rax)		\n\t	vmovaps		%%zmm2,0x180(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1,0x140(%%rax)		\n\t	vmovaps		%%zmm3,0x1c0(%%rax)		\n\t"/* Im part */\
	/* base-root octets 4,5: */\
		"vmovaps	0x200(%%rax),%%zmm0		\n\t	vmovaps	0x280(%%rax),%%zmm2			\n\t"\
		"vmovaps	0x240(%%rax),%%zmm1		\n\t	vmovaps	0x2c0(%%rax),%%zmm3			\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
	"vfnmadd231pd	%%zmm11,%%zmm5,%%zmm0 	\n\t vfnmadd231pd	%%zmm11,%%zmm7,%%zmm2	\n\t"\
	" vfmadd231pd	%%zmm11,%%zmm4,%%zmm1 	\n\t  vfmadd231pd	%%zmm11,%%zmm6,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x200(%%rax)		\n\t	vmovaps		%%zmm2,0x280(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1,0x240(%%rax)		\n\t	vmovaps		%%zmm3,0x2c0(%%rax)		\n\t"/* Im part */\
	/* base-root octets 6,7: */\
		"vmovaps	0x300(%%rax),%%zmm0		\n\t	vmovaps	0x380(%%rax),%%zmm2			\n\t"\
		"vmovaps	0x340(%%rax),%%zmm1		\n\t	vmovaps	0x3c0(%%rax),%%zmm3			\n\t"\
		"vmovaps	%%zmm0,%%zmm4			\n\t	vmovaps		%%zmm2,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm5			\n\t	vmovaps		%%zmm3,%%zmm7			\n\t"\
		"vmulpd		%%zmm10,%%zmm0,%%zmm0	\n\t	vmulpd		%%zmm10,%%zmm2,%%zmm2	\n\t"\
		"vmulpd		%%zmm10,%%zmm1,%%zmm1	\n\t	vmulpd		%%zmm10,%%zmm3,%%zmm3	\n\t"\
	"vfnmadd231pd	%%zmm11,%%zmm5,%%zmm0 	\n\t vfnmadd231pd	%%zmm11,%%zmm7,%%zmm2	\n\t"\
	" vfmadd231pd	%%zmm11,%%zmm4,%%zmm1 	\n\t  vfmadd231pd	%%zmm11,%%zmm6,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x300(%%rax)		\n\t	vmovaps		%%zmm2,0x380(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%zmm1,0x340(%%rax)		\n\t	vmovaps		%%zmm3,0x3c0(%%rax)		\n\t"/* Im part */\
	:						/* outputs: none */\
	:	[__base_root]	"m" (Xbase_root)	/* All inputs from memory addresses here */\
		: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm10","xmm11"   /* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength 8-way Fermat-mod acyclic-transform/IBDWT carry macro
		(based on AVX2 version of SSE2_fermat_carry_norm_errcheck_X4_loacc).
	The array indices i/j/k/lcycle declared int in caller but assumed to have been byte-shift-converted at time this macro called,
	thus can use as complex-address-offsets.  Use bytewise literal offsets to save registers for several args here, as vvv-marked:
																			 vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv */
	#define SSE2_fermat_carry_norm_errcheck_X8_loacc(Xdata,Xbase_root,Xcy_re,Xcy_im,Xodd_radix,Xodd_radm2,Xodd_radm3,Xhalf_arr,Xsign_mask,\
										XicycleA,XicycleB,XicycleC,XicycleD,XicycleE,XicycleF,XicycleG,XicycleH\
												,XjcycleA,XkcycleA,XlcycleA,XmcycleA,XncycleA,XocycleA,XpcycleA, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%zmm24	\n\t"/* prp_mult, broadcast to all double-slots of zmm24 */\
		"movq		%[__add0],%%rcx			\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax			\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x80(%%rdx),%%zmm21	\n\t"/* xmm21 = maxerr */\
		"addq		$%c[__odd_radix],%%rdx	\n\t"/* wt|wtinv|base|baseinv data offset by icycle array slots from resp. base addresses */\
		/* Multiply complex transform outputs [x,y] = [re,im] by inverse IBDWT weights, which include the 2/n scale factor;
		Store the weights in the registers which will hold the Im-parts below - cf. note in ensuing transpose section re. funky indexing: */\
		"movslq		%[__icycleA],%%r8 		\n\t	vmovaps	(%%rdx,%%r8 ),%%zmm7 	\n\t"/* [wtinv0-7]A */\
		"movslq		%[__icycleB],%%r9 		\n\t	vmovaps	(%%rdx,%%r9 ),%%zmm9 	\n\t"/* [wtinv0-7]B */\
		"movslq		%[__icycleC],%%r10		\n\t	vmovaps	(%%rdx,%%r10),%%zmm11	\n\t"/* [wtinv0-7]C */\
		"movslq		%[__icycleD],%%r11		\n\t	vmovaps	(%%rdx,%%r11),%%zmm17	\n\t"/* [wtinv0-7]D */\
		"movslq		%[__icycleE],%%r12		\n\t	vmovaps	(%%rdx,%%r12),%%zmm13	\n\t"/* [wtinv0-7]E */\
		"movslq		%[__icycleF],%%r13		\n\t	vmovaps	(%%rdx,%%r13),%%zmm1 	\n\t"/* [wtinv0-7]F */\
		"movslq		%[__icycleG],%%r14		\n\t	vmovaps	(%%rdx,%%r14),%%zmm3 	\n\t"/* [wtinv0-7]G */\
		"movslq		%[__icycleH],%%r15		\n\t	vmovaps	(%%rdx,%%r15),%%zmm15	\n\t"/* [wtinv0-7]H */\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0/1,2/3,4/5,6/7,8/9,a/b,c/d,e/f. Outputs into zmm0/1,2/3,4/5,6/7,8/9,a/b,c/d,e/f: */\
		"movq		%[__data],%%rax		\n\t"\
	/* Real parts use zmm0,2,4,6,8,10,12,14,16:				Imag parts use zmm1,3,5,7,9,11,13,15,17: */\
		/* Read in the 8 rows of our input matrix ... the funky index ordering gives ordered outputs sans reg-copying: */\
		"vmulpd	0x000(%%rax),%%zmm7 ,%%zmm6 		\n\t	vmulpd	0x040(%%rax),%%zmm7 ,%%zmm7 		\n\t"\
		"vmulpd	0x080(%%rax),%%zmm9 ,%%zmm8 		\n\t	vmulpd	0x0c0(%%rax),%%zmm9 ,%%zmm9 		\n\t"\
		"vmulpd	0x100(%%rax),%%zmm11,%%zmm10		\n\t	vmulpd	0x140(%%rax),%%zmm11,%%zmm11		\n\t"\
		"vmulpd	0x180(%%rax),%%zmm17,%%zmm16		\n\t	vmulpd	0x1c0(%%rax),%%zmm17,%%zmm17		\n\t"\
		"vmulpd	0x200(%%rax),%%zmm13,%%zmm12		\n\t	vmulpd	0x240(%%rax),%%zmm13,%%zmm13		\n\t"\
		"vmulpd	0x280(%%rax),%%zmm1 ,%%zmm0 		\n\t	vmulpd	0x2c0(%%rax),%%zmm1 ,%%zmm1 		\n\t"\
		"vmulpd	0x300(%%rax),%%zmm3 ,%%zmm2 		\n\t	vmulpd	0x340(%%rax),%%zmm3 ,%%zmm3 		\n\t"\
		"vmulpd	0x380(%%rax),%%zmm15,%%zmm14		\n\t	vmulpd	0x3c0(%%rax),%%zmm15,%%zmm15		\n\t"\
		"\n\t"\
		"vunpcklpd		%%zmm8 ,%%zmm6 ,%%zmm4 		\n\t	vunpcklpd		%%zmm9 ,%%zmm7 ,%%zmm5 		\n\t"\
		"vunpckhpd		%%zmm8 ,%%zmm6 ,%%zmm8 		\n\t	vunpckhpd		%%zmm9 ,%%zmm7 ,%%zmm9 		\n\t"\
		"vunpcklpd		%%zmm16,%%zmm10,%%zmm6 		\n\t	vunpcklpd		%%zmm17,%%zmm11,%%zmm7 		\n\t"\
		"vunpckhpd		%%zmm16,%%zmm10,%%zmm16		\n\t	vunpckhpd		%%zmm17,%%zmm11,%%zmm17		\n\t"\
		"vunpcklpd		%%zmm0 ,%%zmm12,%%zmm10		\n\t	vunpcklpd		%%zmm1 ,%%zmm13,%%zmm11		\n\t"\
		"vunpckhpd		%%zmm0 ,%%zmm12,%%zmm0 		\n\t	vunpckhpd		%%zmm1 ,%%zmm13,%%zmm1 		\n\t"\
		"vunpcklpd		%%zmm14,%%zmm2 ,%%zmm12		\n\t	vunpcklpd		%%zmm15,%%zmm3 ,%%zmm13		\n\t"\
		"vunpckhpd		%%zmm14,%%zmm2 ,%%zmm14		\n\t	vunpckhpd		%%zmm15,%%zmm3 ,%%zmm15		\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm6 ,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm7 ,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm6 ,%%zmm4 ,%%zmm6 	\n\t	vshuff64x2	$221,%%zmm7 ,%%zmm5 ,%%zmm7 	\n\t"\
		"vshuff64x2	$136,%%zmm16,%%zmm8 ,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm17,%%zmm9 ,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm16,%%zmm8 ,%%zmm16	\n\t	vshuff64x2	$221,%%zmm17,%%zmm9 ,%%zmm17	\n\t"\
		"vshuff64x2	$136,%%zmm12,%%zmm10,%%zmm8 	\n\t	vshuff64x2	$136,%%zmm13,%%zmm11,%%zmm9 	\n\t"\
		"vshuff64x2	$221,%%zmm12,%%zmm10,%%zmm12	\n\t	vshuff64x2	$221,%%zmm13,%%zmm11,%%zmm13	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm0 ,%%zmm10	\n\t	vshuff64x2	$136,%%zmm15,%%zmm1 ,%%zmm11	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm0 ,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm1 ,%%zmm15	\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm2 ,%%zmm0 	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm3 ,%%zmm1 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm2 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm3 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm10,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm11,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm10,%%zmm4 ,%%zmm10	\n\t	vshuff64x2	$221,%%zmm11,%%zmm5 ,%%zmm11	\n\t"\
		"vshuff64x2	$136,%%zmm12,%%zmm6 ,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm13,%%zmm7 ,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm12,%%zmm6 ,%%zmm12	\n\t	vshuff64x2	$221,%%zmm13,%%zmm7 ,%%zmm13	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm16,%%zmm6 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm17,%%zmm7 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm16,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm17,%%zmm15	\n\t"\
		/* Outputs are now ordered - leave in registers. */\
		/* Base negacyclic roots at this address in [0,2,4,6,8,a,c,e]*0x40 (Re parts), [1,3,5,7,9,b,d,f]*0x40 (Imag parts) */\
		"movq		%[__sign_mask],%%rax	\n\t"\
		"vmovaps	(%%rax),%%zmm23			\n\t"/* zmm23: sign_mask needed for floating ABS */\
		"movq		%[__base_root],%%rax	\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movq		%[__cy_re],%%rbx		\n\t"\
	/* Do a-octet: Data in zmm0,zmm1 */\
	"prefetcht0	(%%rcx)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	     (%%rax),%%zmm18	\n\t"/* c = Re part of 1st base-root octet */\
		"vmovaps	0x040(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm21,%%zmm22			\n\t"/* maxerr copy */\
		"movslq		%[__icycleA],%%rdx		\n\t"/* icycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm0,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm1,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm0,%%zmm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm1,%%zmm1	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm0 	\n\t"/* wt_im*[y copy] ...[a0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm1 	\n\t"/* wt_im*[x copy] ...[a0-7.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%zmm0,%%zmm16			\n\t	vmovaps		%%zmm1,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm0,%%zmm0		\n\t	vrndscalepd $0,%%zmm1,%%zmm1	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm0 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm1 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm0	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm1	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm0 ,%%zmm16			\n\t	vmovaps		%%zmm1 ,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm0 	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm1	\n\t"/* zmm0|1 = [a0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm0 ,%%zmm16			\n\t	vmovaps		%%zmm1 ,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm0,%%zmm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm1,%%zmm1	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm0 	\n\t"/* wt_im*[y copy] ... [a0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm1 	\n\t"/* wt_im*[x copy] ... [a0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x040(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,     (%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do b-octet: Data in zmm2,zmm3 */\
	"movslq		%[__p1],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x080(%%rax),%%zmm18	\n\t"/* c = Re part of 2nd base-root octet */\
		"vmovaps	0x0c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__jcycleA],%%rdx		\n\t"/* jcycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm2 ,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm3 ,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm2 ,%%zmm2 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm3 ,%%zmm3 	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm2 	\n\t"/* wt_im*[y copy] ...[a0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm3 	\n\t"/* wt_im*[x copy] ...[a0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm2 ,%%zmm16			\n\t	vmovaps		%%zmm3 ,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm2 ,%%zmm2 		\n\t	vrndscalepd $0,%%zmm3 ,%%zmm3 	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm2 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm3 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm2	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm2 ,%%zmm16			\n\t	vmovaps		%%zmm3 ,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm2 	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm3 	\n\t"/* zmm2|3 = [b0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm2 ,%%zmm16			\n\t	vmovaps		%%zmm3 ,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm2 ,%%zmm2 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm3 ,%%zmm3 	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm2 	\n\t"/* wt_im*[y copy] ... [a0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm3 	\n\t"/* wt_im*[x copy] ... [a0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x0c0(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x080(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do c-octet: Data in zmm4,zmm5 */\
	"movslq		%[__p2],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x100(%%rax),%%zmm18	\n\t"/* c = Re part of 3rd base-root octet */\
		"vmovaps	0x140(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__kcycleA],%%rdx		\n\t"/* kcycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm4 ,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm5 ,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm4 ,%%zmm4 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm5 ,%%zmm5 	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm4 	\n\t"/* wt_im*[y copy] ...[a0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm5 	\n\t"/* wt_im*[x copy] ...[a0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm4 ,%%zmm16			\n\t	vmovaps		%%zmm5 ,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm4 ,%%zmm4 		\n\t	vrndscalepd $0,%%zmm5 ,%%zmm5 	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm4 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm5 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm4	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm4 ,%%zmm16			\n\t	vmovaps		%%zmm5 ,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm4 	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm5 	\n\t"/* zmm4|5 = [c0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm4 ,%%zmm16			\n\t	vmovaps		%%zmm5 ,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm4 ,%%zmm4 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm5 ,%%zmm5 	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm4 	\n\t"/* wt_im*[y copy] ... [a0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm5 	\n\t"/* wt_im*[x copy] ... [a0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x140(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x100(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do d-octet: Data in zmm6,zmm7 */\
	"movslq		%[__p3],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x180(%%rax),%%zmm18	\n\t"/* c = Re part of 4th base-root octet */\
		"vmovaps	0x1c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__lcycleA],%%rdx		\n\t"/* lcycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm6 ,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm7 ,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm6 ,%%zmm6 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm7 ,%%zmm7 	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm6 	\n\t"/* wt_im*[y copy] ...[a0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm7 	\n\t"/* wt_im*[x copy] ...[a0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm6 ,%%zmm16			\n\t	vmovaps		%%zmm7 ,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm6 ,%%zmm6 		\n\t	vrndscalepd $0,%%zmm7 ,%%zmm7 	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm6 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm7 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm6	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm6 ,%%zmm16			\n\t	vmovaps		%%zmm7 ,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm6 	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm7 	\n\t"/* zmm6|7 = [d0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm6 ,%%zmm16			\n\t	vmovaps		%%zmm7 ,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm6 ,%%zmm6 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm7 ,%%zmm7 	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm6 	\n\t"/* wt_im*[y copy] ... [a0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm7 	\n\t"/* wt_im*[x copy] ... [a0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x1c0(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x180(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do e-octet: Data in zmm8,zmm9 */\
	"movslq		%[__p4],%%rdx	\n\t"\
	"leaq (%%rcx,%%rdx,8),%%rcx	\n\t"\
	"prefetcht0	(%%rcx)			\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x200(%%rax),%%zmm18	\n\t"/* c = Re part of 5th base-root octet */\
		"vmovaps	0x240(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__mcycleA],%%rdx		\n\t"/* mcycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm8 ,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm9 ,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm8 ,%%zmm8 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm9 ,%%zmm9 	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm8 	\n\t"/* wt_im*[y copy] ...[e0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm9 	\n\t"/* wt_im*[x copy] ...[e0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm8 ,%%zmm16			\n\t	vmovaps		%%zmm9 ,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm8 ,%%zmm8 		\n\t	vrndscalepd $0,%%zmm9 ,%%zmm9 	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm8 ,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm9 ,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm8	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm8 ,%%zmm16			\n\t	vmovaps		%%zmm9 ,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm8 	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm9 	\n\t"/* zmm8|9 = [e0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm8 ,%%zmm16			\n\t	vmovaps		%%zmm9 ,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm8 ,%%zmm8 	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm9 ,%%zmm9 	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm8 	\n\t"/* wt_im*[y copy] ... [e0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm9 	\n\t"/* wt_im*[x copy] ... [e0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x240(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x200(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do f-octet: Data in zmm10,zmm11 */\
	"movslq		%[__p1],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x280(%%rax),%%zmm18	\n\t"/* c = Re part of 6th base-root octet */\
		"vmovaps	0x2c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__ncycleA],%%rdx		\n\t"/* ncycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm10,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm11,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm10,%%zmm10	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm11,%%zmm11	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm10	\n\t"/* wt_im*[y copy] ...[f0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm11	\n\t"/* wt_im*[x copy] ...[f0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm10,%%zmm16			\n\t	vmovaps		%%zmm11,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm10,%%zmm10		\n\t	vrndscalepd $0,%%zmm11,%%zmm11	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm10,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm11,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm10	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm10,%%zmm16			\n\t	vmovaps		%%zmm11,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm10	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm11	\n\t"/* zmm10|11 = [f0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm10,%%zmm16			\n\t	vmovaps		%%zmm11,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm10,%%zmm10	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm11,%%zmm11	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm10	\n\t"/* wt_im*[y copy] ... [f0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm11	\n\t"/* wt_im*[x copy] ... [f0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x2c0(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x280(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do g-octet: Data in zmm12,zmm13 */\
	"movslq		%[__p2],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x300(%%rax),%%zmm18	\n\t"/* c = Re part of 7th base-root octet */\
		"vmovaps	0x340(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__ocycleA],%%rdx		\n\t"/* ocycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm12,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm13,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm12,%%zmm12	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm13,%%zmm13	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm12	\n\t"/* wt_im*[y copy] ...[g0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm13	\n\t"/* wt_im*[x copy] ...[g0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm12,%%zmm16			\n\t	vmovaps		%%zmm13,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm12,%%zmm12		\n\t	vrndscalepd $0,%%zmm13,%%zmm13	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm12,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm13,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm12	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm13	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm12,%%zmm16			\n\t	vmovaps		%%zmm13,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm12	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm13	\n\t"/* zmm12|13 = [g0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm12,%%zmm16			\n\t	vmovaps		%%zmm13,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm12,%%zmm12	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm13,%%zmm13	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm12	\n\t"/* wt_im*[y copy] ... [g0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm13	\n\t"/* wt_im*[x copy] ... [g0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x340(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x300(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
	/* Now do h-octet: Data in zmm14,zmm15 */\
	"movslq		%[__p3],%%rdx	\n\t"\
	"prefetcht0	(%%rcx,%%rdx,8)	\n\t"\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x380(%%rax),%%zmm18	\n\t"/* c = Re part of 8th base-root octet */\
		"vmovaps	0x3c0(%%rax),%%zmm19	\n\t"/* s = Im part */\
		"vmovaps	%%zmm22,%%zmm21			\n\t"/* maxerr copy */\
		"movslq		%[__pcycleA],%%rdx		\n\t"/* pcycle assumed already in left-shifted ptr-byte-offset form */\
		"addq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm14,%%zmm16			\n\t"/* x copy */\
		"vmovaps	%%zmm15,%%zmm17			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%zmm18,%%zmm14,%%zmm14	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm15,%%zmm15	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%zmm19,%%zmm17,%%zmm14	\n\t"/* wt_im*[y copy] ...[h0-7.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%zmm19,%%zmm16,%%zmm15	\n\t"/* wt_im*[x copy] ...[h0-7.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%zmm14,%%zmm16			\n\t	vmovaps		%%zmm15,%%zmm17		\n\t"/* copy x|y */\
		"vrndscalepd $0,%%zmm14,%%zmm14		\n\t	vrndscalepd $0,%%zmm15,%%zmm15	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%zmm20	\n\t"/* [baseinv0-7] */\
		"vsubpd		%%zmm14,%%zmm16,%%zmm16	\n\t	vsubpd		%%zmm15,%%zmm17,%%zmm17	\n\t"/* frac = [x - temp] */\
		"vpandq		%%zmm23,%%zmm16,%%zmm16	\n\t	vpandq		%%zmm23,%%zmm17,%%zmm17	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%zmm24,%%zmm14	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%zmm24,%%zmm15	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm21,%%zmm16,%%zmm21	\n\t	vmaxpd		%%zmm22,%%zmm17,%%zmm22	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%zmm14,%%zmm16			\n\t	vmovaps		%%zmm15,%%zmm17			\n\t"/* cpy temp */\
		"vmulpd		%%zmm20,%%zmm16,%%zmm16	\n\t	vmulpd		%%zmm20,%%zmm17,%%zmm17	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%zmm21,%%zmm22,%%zmm22	\n\t"/* merge re|im maxerr vectors */\
		"vrndscalepd $0,%%zmm16,%%zmm16		\n\t	vrndscalepd $0,%%zmm17,%%zmm17		\n\t"/* [cy0-7.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%zmm21	\n\t"/* [base0-7] */\
		"vmovaps	%%zmm16,(%%rbx)			\n\t	vmovaps		%%zmm17,%c[__cy_im](%%rbx)\n\t"/* store [cy0-7.re|im] */\
	"vfnmadd231pd	%%zmm21,%%zmm16,%%zmm14	\n\t vfnmadd231pd	%%zmm21,%%zmm17,%%zmm15	\n\t"/* zmm14|15 = [h0-7.re|im] = temp - [cy0-7.re|im]*base[0] */\
		"vmovaps	%%zmm14,%%zmm16			\n\t	vmovaps		%%zmm15,%%zmm17			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%zmm18,%%zmm14,%%zmm14	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%zmm18,%%zmm15,%%zmm15	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm14	\n\t"/* wt_im*[y copy] ... [h0-7.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm15	\n\t"/* wt_im*[x copy] ... [h0-7.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots (c in zmm18, s in zmm19) by exp(j*I*Pi/2)/RADIX, for j = 8: complex mul (c + I*s)*(x + I*y) */\
		"vmovaps	0x480(%%rax),%%zmm16	\n\t"/* zmm16 = x = Re(exp) */\
		"vmovaps	0x4c0(%%rax),%%zmm17	\n\t"/* zmm17 = y = Im(exp) */\
		"vmulpd		%%zmm18,%%zmm17,%%zmm20	\n\t"/* zmm20 = c.y */\
		"vmulpd		%%zmm18,%%zmm16,%%zmm18	\n\t"/* zmm18 = c.x */\
	" vfmadd231pd	%%zmm19,%%zmm16,%%zmm20	\n\t"/* zmm20 = wt.im = c.y + s.x */\
	"vfnmadd231pd	%%zmm19,%%zmm17,%%zmm18	\n\t"/* zmm18 = wt.re = c.x - s.y */\
		"vmovaps	%%zmm20,0x3c0(%%rax)	\n\t"/* Store Im part of updated root */\
		"vmovaps	%%zmm18,0x380(%%rax)	\n\t"/* Store Re part, overwriting the old base root */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%zmm22,-0x80(%%rdx)	\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately), now using a reg-copy-less algo with ordered *inputs*: */\
		"movq		%[__data],%%rax			\n\t"\
		"vunpcklpd		%%zmm2 ,%%zmm0 ,%%zmm16		\n\t	vunpcklpd		%%zmm3 ,%%zmm1 ,%%zmm17		\n\t"\
		"vunpckhpd		%%zmm2 ,%%zmm0 ,%%zmm2 		\n\t	vunpckhpd		%%zmm3 ,%%zmm1 ,%%zmm3 		\n\t"\
		"vunpcklpd		%%zmm6 ,%%zmm4 ,%%zmm0 		\n\t	vunpcklpd		%%zmm7 ,%%zmm5 ,%%zmm1 		\n\t"\
		"vunpckhpd		%%zmm6 ,%%zmm4 ,%%zmm6 		\n\t	vunpckhpd		%%zmm7 ,%%zmm5 ,%%zmm7 		\n\t"\
		"vunpcklpd		%%zmm10,%%zmm8 ,%%zmm4 		\n\t	vunpcklpd		%%zmm11,%%zmm9 ,%%zmm5 		\n\t"\
		"vunpckhpd		%%zmm10,%%zmm8 ,%%zmm10		\n\t	vunpckhpd		%%zmm11,%%zmm9 ,%%zmm11		\n\t"\
		"vunpcklpd		%%zmm14,%%zmm12,%%zmm8 		\n\t	vunpcklpd		%%zmm15,%%zmm13,%%zmm9 		\n\t"\
		"vunpckhpd		%%zmm14,%%zmm12,%%zmm14		\n\t	vunpckhpd		%%zmm15,%%zmm13,%%zmm15		\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm0 ,%%zmm16,%%zmm12	\n\t	vshuff64x2	$136,%%zmm1 ,%%zmm17,%%zmm13	\n\t"\
		"vshuff64x2	$221,%%zmm0 ,%%zmm16,%%zmm0 	\n\t	vshuff64x2	$221,%%zmm1 ,%%zmm17,%%zmm1 	\n\t"\
		"vshuff64x2	$136,%%zmm6 ,%%zmm2 ,%%zmm16	\n\t	vshuff64x2	$136,%%zmm7 ,%%zmm3 ,%%zmm17 	\n\t"\
		"vshuff64x2	$221,%%zmm6 ,%%zmm2 ,%%zmm6 	\n\t	vshuff64x2	$221,%%zmm7 ,%%zmm3 ,%%zmm7 	\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm4 ,%%zmm2 	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm5 ,%%zmm3 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm4 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm5 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm10,%%zmm4 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm11,%%zmm5 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm10,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm11,%%zmm15	\n\t"\
		"\n\t"\
		"vshuff64x2	$136,%%zmm2 ,%%zmm12,%%zmm10	\n\t	vshuff64x2	$136,%%zmm3 ,%%zmm13,%%zmm11	\n\t"\
		"vshuff64x2	$221,%%zmm2 ,%%zmm12,%%zmm2 	\n\t	vshuff64x2	$221,%%zmm3 ,%%zmm13,%%zmm3 	\n\t"\
		"vshuff64x2	$136,%%zmm4 ,%%zmm16,%%zmm12	\n\t	vshuff64x2	$136,%%zmm5 ,%%zmm17,%%zmm13	\n\t"\
		"vshuff64x2	$221,%%zmm4 ,%%zmm16,%%zmm4 	\n\t	vshuff64x2	$221,%%zmm5 ,%%zmm17,%%zmm5 	\n\t"\
		"vshuff64x2	$136,%%zmm8 ,%%zmm0 ,%%zmm16	\n\t	vshuff64x2	$136,%%zmm9 ,%%zmm1 ,%%zmm17 	\n\t"\
		"vshuff64x2	$221,%%zmm8 ,%%zmm0 ,%%zmm8 	\n\t	vshuff64x2	$221,%%zmm9 ,%%zmm1 ,%%zmm9 	\n\t"\
		"vshuff64x2	$136,%%zmm14,%%zmm6 ,%%zmm0 	\n\t	vshuff64x2	$136,%%zmm15,%%zmm7 ,%%zmm1 	\n\t"\
		"vshuff64x2	$221,%%zmm14,%%zmm6 ,%%zmm14	\n\t	vshuff64x2	$221,%%zmm15,%%zmm7 ,%%zmm15	\n\t"\
		/* Multiply normalized, re-permuted transform outputs by forward IBDWT weights ... r8-r15 still hold icycleA-H: */\
		"vmulpd		(%%rdx,%%r8 ),%%zmm10,%%zmm10		\n\t	vmulpd		(%%rdx,%%r8 ),%%zmm11,%%zmm11	\n\t"\
		"vmulpd		(%%rdx,%%r9 ),%%zmm12,%%zmm12		\n\t	vmulpd		(%%rdx,%%r9 ),%%zmm13,%%zmm13	\n\t"\
		"vmulpd		(%%rdx,%%r10),%%zmm16,%%zmm16		\n\t	vmulpd		(%%rdx,%%r10),%%zmm17,%%zmm17	\n\t"\
		"vmulpd		(%%rdx,%%r11),%%zmm0 ,%%zmm0 		\n\t	vmulpd		(%%rdx,%%r11),%%zmm1 ,%%zmm1 	\n\t"\
		"vmulpd		(%%rdx,%%r12),%%zmm2 ,%%zmm2 		\n\t	vmulpd		(%%rdx,%%r12),%%zmm3 ,%%zmm3 	\n\t"\
		"vmulpd		(%%rdx,%%r13),%%zmm4 ,%%zmm4 		\n\t	vmulpd		(%%rdx,%%r13),%%zmm5 ,%%zmm5 	\n\t"\
		"vmulpd		(%%rdx,%%r14),%%zmm8 ,%%zmm8 		\n\t	vmulpd		(%%rdx,%%r14),%%zmm9 ,%%zmm9 	\n\t"\
		"vmulpd		(%%rdx,%%r15),%%zmm14,%%zmm14		\n\t	vmulpd		(%%rdx,%%r15),%%zmm15,%%zmm15	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm11,0x040(%%rax)	\n\t"\
		"vmovaps	%%zmm12,0x080(%%rax)				\n\t	vmovaps		%%zmm13,0x0c0(%%rax)	\n\t"\
		"vmovaps	%%zmm16,0x100(%%rax)				\n\t	vmovaps		%%zmm17,0x140(%%rax)	\n\t"\
		"vmovaps	%%zmm0 ,0x180(%%rax)				\n\t	vmovaps		%%zmm1 ,0x1c0(%%rax)	\n\t"\
		"vmovaps	%%zmm2 ,0x200(%%rax)				\n\t	vmovaps		%%zmm3 ,0x240(%%rax)	\n\t"\
		"vmovaps	%%zmm4 ,0x280(%%rax)				\n\t	vmovaps		%%zmm5 ,0x2c0(%%rax)	\n\t"\
		"vmovaps	%%zmm8 ,0x300(%%rax)				\n\t	vmovaps		%%zmm9 ,0x340(%%rax)	\n\t"\
		"vmovaps	%%zmm14,0x380(%%rax)				\n\t	vmovaps		%%zmm15,0x3c0(%%rax)	\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"e" (Xcy_im)	/* Use literal-byte-offset for this ome to save a reg */\
		/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, resp. - assumed << l2_sz_vd on input: */\
		,	[__odd_radix]   "e" (Xodd_radix)\
		,	[__odd_radm2]   "e" (Xodd_radm2)\
		,	[__odd_radm3]   "e" (Xodd_radm3)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Need octet of ascending [modulo odd_radix] icycle indices for IBDWT weights: */\
		,	[__icycleA]		"m" (XicycleA)\
		,	[__icycleB]		"m" (XicycleB)\
		,	[__icycleC]		"m" (XicycleC)\
		,	[__icycleD]		"m" (XicycleD)\
		,	[__icycleE]		"m" (XicycleE)\
		,	[__icycleF]		"m" (XicycleF)\
		,	[__icycleG]		"m" (XicycleG)\
		,	[__icycleH]		"m" (XicycleH)\
		/* Need octet of same-index [i,j,k,l]cycle indices for negacyclic weights and base/baseinv normalizations: */\
		,	[__jcycleA]		"m" (XjcycleA)\
		,	[__kcycleA]		"m" (XkcycleA)\
		,	[__lcycleA]		"m" (XlcycleA)\
		,	[__mcycleA]		"m" (XmcycleA)\
		,	[__ncycleA]		"m" (XncycleA)\
		,	[__ocycleA]		"m" (XocycleA)\
		,	[__pcycleA]		"m" (XpcycleA)\
		/* Prefetch: base address and 4 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		,	[__p4] "m" (Xp4)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","r8","r9","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23","xmm24"	/* Clobbered registers */\
	);\
	}

  #elif defined(USE_AVX2)	// FMA-based versions of selected macros in this file for Intel AVX2/FMA3

	// Power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro:
	/* Power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro, AVX2/FMA3 version.
	**
	** Processes 4 quartets (a-d) of complex transform outputs: for each quartet it
	**	1. applies the inverse negacyclic weight (wt_re, -wt_im);
	**	2. rounds to nearest (vroundpd $0), folds in the incoming carry scaled by prp_mult
	**	   (vfmadd213pd), and computes the fractional round-off error, accumulating the
	**	   running maximum into ymm13/ymm14;
	**	3. extracts the new carry via baseinv, stores it to (cy_re|cy_im), and subtracts
	**	   carry*base to normalize the digit;
	**	4. re-applies the forward acyclic-convolution weight (wt_re, +wt_im).
	** The 4 base negacyclic-root quartets at [base_root] are first up-multiplied in place by
	** the complex constant at [base_root + cmul_offset]; set 1's product is kept in ymm10/11.
	** Data are 4x4-transposed on input (with the inverse-runlength scale factor applied) and
	** transposed back on output. Prefetches from the main array at add0 + 8*{0,p1,p2,p3} are
	** interleaved between quartets.
	**
	** Inputs (all memory operands): Xdata = 32-byte-aligned data block; Xbase_root = root
	** quartets (modified in place); Xcmul_offset = 32-bit byte offset (sign-extended);
	** Xcy_re/Xcy_im = carry vectors (read and written); Xhalf_arr = table with base at +0x0,
	** baseinv at +0x20, scale at +0x40, maxerr at -0x40 (maxerr updated on exit);
	** Xsign_mask = abs-value mask; Xadd0,Xp1-3 = prefetch base/offsets; Xprp_mult = pointer
	** to PRP residue-shift multiplier, broadcast into ymm15 and kept live throughout.
	** NOTE(review): the half_arr layout above is inferred from the offsets used in the asm
	** body -- confirm against the caller that sets up half_arr.
	**
	** GPRs used: rax (base_root, then data), rbx/rcx (cy_re/cy_im), rdx (half_arr),
	** rsi (sign_mask), r14/r15 (prefetch base/offset) -- all declared in the clobber list.
	*/
	#define SSE2_fermat_carry_norm_pow2_errcheck_X4(Xdata,Xbase_root,Xcmul_offset,Xcy_re,Xcy_im,Xhalf_arr,Xsign_mask, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__add0],%%r14		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root] ,%%rax			\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movslq	%[__cmul_offset],%%rbx			\n\t"\
		"addq	%%rax,%%rbx	\n\t"/* Index into complex const multipliers block, each applied to 4 sets of base roots */\
		/* Up-multiply quartet of negacyclic roots used in this macro invocation; store sets 2-4 back into mem, keep set 1 in ymm10,11 [that's why we do sets 1/2 after 3/4] */\
		"vmovaps	    (%%rbx),%%ymm10		\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x20(%%rbx),%%ymm11		\n\t	"/* c = Re(exp) in ymm0, s = Im(exp) in ymm1 */\
		"/* Sets 3/4: */"\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
		"/* Sets 1/2: */"\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im part */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"/* s.y */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"/* s.x */\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm10	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm11	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"											vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"											vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	0x40(%%rdx),%%ymm12	\n\t"/* xmm12 = scale */\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5,6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
	/* Apply inverse-complex-runlength scaling factor to the data: */\
		"vmulpd		%%ymm12,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm12,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm12,%%ymm6,%%ymm6					\n\t		vmulpd		%%ymm12,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm12,%%ymm0,%%ymm0					\n\t		vmulpd		%%ymm12,%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm12,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm12,%%ymm5,%%ymm5	\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
	/*	"vmovaps	-0x20(%%rdx),%%ymm15	\n\t"// rnd_const; prefer ROUNDPD in AVX mode, so ymm15 free */\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
		"movq		%[__cy_im],%%rcx		\n\t"\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%r14)	\n\t"\
		/* For a-quartet, needed negacyclic root already in ymm10/11: */\
		/* Data in ymm0,ymm1 */\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd	(%%rbx),%%ymm15,%%ymm0	\n\t vfmadd213pd	(%%rcx),%%ymm15,%%ymm1	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm9 ,%%ymm1,%%ymm1	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm2,ymm3 */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd	(%%rbx),%%ymm15,%%ymm2	\n\t vfmadd213pd	(%%rcx),%%ymm15,%%ymm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm2,%%ymm2	\n\t	vsubpd		%%ymm9 ,%%ymm3,%%ymm3	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm4,ymm5 */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd	(%%rbx),%%ymm15,%%ymm4	\n\t vfmadd213pd	(%%rcx),%%ymm15,%%ymm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm9 ,%%ymm5,%%ymm5	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm6,ymm7 */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd	(%%rbx),%%ymm15,%%ymm6	\n\t vfmadd213pd	(%%rcx),%%ymm15,%%ymm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm6,%%ymm6	\n\t	vsubpd		%%ymm9 ,%%ymm7,%%ymm7	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5,6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cmul_offset] "m" (Xcmul_offset)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"m" (Xcy_im)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.

	Key differences vs pow2 version:
	- Use odd_radix as index offset into local storage for IBDWT weights and variable base/baseinv terms;
	- Apply inv/fwd IBDWT weights bookending the negacyclic weights;
	- Value of base/baseinv to be applied to output taken from odd_radix-length array, using same index as for selecting IBDWT weight.

	The array indices i/j/k/lcycle declared int in caller but assumed to have been byte-shift-converted at time this macro called,
	thus can use as complex-address-offsets.  Use bytewise literal offsets to save registers for several args here,as vvv-marked:
												                                           vvvvv The [1,2,3]-multiples of odd_radix assumed << l2_sz_vd on input */
	#define SSE2_fermat_carry_norm_errcheck_X4_hiacc(Xdata,Xbase_root,Xcmul_offset,Xcy_re,Xcy_im,Xodd_radix,Xodd_radm2,Xodd_radm3,Xhalf_arr,Xsign_mask,XicycleA,XicycleB,XicycleC,XicycleD, XjcycleA,XkcycleA,XlcycleA, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq		%[__add0],%%rcx		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root]  ,%%rax			\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movslq	%[__cmul_offset],%%rbx			\n\t"\
		"addq	%%rax,%%rbx	\n\t"/* Index into complex const multipliers block, each applied to 4 sets of base roots */\
		/* Up-multiply quartet of negacyclic roots used in this macro invocation; store sets 2-4 back into mem, keep set 1 in ymm10,11 [that's why we do sets 1/2 after 3/4] */\
		"vmovaps	    (%%rbx),%%ymm10		\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x20(%%rbx),%%ymm11		\n\t	"/* c = Re(exp) in ymm0, s = Im(exp) in ymm1 */\
		/* Sets 3/4: */\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
		/* Sets 1/2: */\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im part */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"/* s.y */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"/* s.x */\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm10	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm11	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"											vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"											vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"addq		$%c[__odd_radix],%%rdx				\n\t"/* wt|wtinv|base|baseinv data offset by icycle array slots from resp. base addresses */\
		/* Multiply complex transform outputs [x,y] = [re,im] by inverse IBDWT weights, which include the 2/n scale factor: */\
		"movslq		%[__icycleA],%%rdi		\n\t"\
		"movslq		%[__icycleB],%%r9 		\n\t"\
		"movslq		%[__icycleC],%%r8 		\n\t"\
		"movslq		%[__icycleD],%%r10		\n\t"\
		"vmovaps	(%%rdx,%%rdi),%%ymm12	\n\t"/* [wtinv0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm13	\n\t"/* [wtinv0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm14	\n\t"/* [wtinv0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm15	\n\t"/* [wtinv0-3]D */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
		"vmovaps	     (%%rax),%%ymm4						\n\t		vmovaps	0x020(%%rax),%%ymm5							\n\t"\
		"vmovaps	0x040(%%rax),%%ymm2						\n\t		vmovaps	0x060(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm12,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm12,%%ymm5,%%ymm5					\n\t"\
		"vmulpd		%%ymm13,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm13,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm4,%%ymm6				\n\t		vshufpd	$15,%%ymm3,%%ymm5,%%ymm7					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm4,%%ymm4				\n\t		vshufpd	$0 ,%%ymm3,%%ymm5,%%ymm5					\n\t"\
		"vmovaps	0x080(%%rax),%%ymm8						\n\t		vmovaps	0x0a0(%%rax),%%ymm9							\n\t"\
		"vmovaps	0x0c0(%%rax),%%ymm2						\n\t		vmovaps	0x0e0(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm14,%%ymm8,%%ymm8					\n\t		vmulpd		%%ymm14,%%ymm9,%%ymm9					\n\t"\
		"vmulpd		%%ymm15,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm15,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm8,%%ymm0				\n\t		vshufpd	$15,%%ymm3,%%ymm9,%%ymm1					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm8,%%ymm8				\n\t		vshufpd	$0 ,%%ymm3,%%ymm9,%%ymm9					\n\t"\
		"vperm2f128 $32,%%ymm0,%%ymm6,%%ymm2	/* Re B	*/	\n\t		vperm2f128 $32,%%ymm1,%%ymm7,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm0,%%ymm6,%%ymm6	/* Re D	*/	\n\t		vperm2f128 $49,%%ymm1,%%ymm7,%%ymm7		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm8,%%ymm4,%%ymm0	/* Re A	*/	\n\t		vperm2f128 $32,%%ymm9,%%ymm5,%%ymm1 	/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm8,%%ymm4,%%ymm4	/* Re C	*/	\n\t		vperm2f128 $49,%%ymm9,%%ymm5,%%ymm5 	/* Im C	*/	\n\t"\
		"subq		$%c[__odd_radix],%%rdx				\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
		"addq		%%rdi,%%rdx				\n\t"/* icycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%rcx)	\n\t"\
		/* For a-quartet, needed negacyclic root already in ymm10/11: */\
		/* Data in ymm0,ymm1 */\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm0	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm1	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm9 ,%%ymm1,%%ymm1	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__jcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* jcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm2,ymm3 */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm2	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm2,%%ymm2	\n\t	vsubpd		%%ymm9 ,%%ymm3,%%ymm3	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__kcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* kcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm4,ymm5 */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm4	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm9 ,%%ymm5,%%ymm5	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__lcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* lcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm6,ymm7 */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm6	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm6,%%ymm6	\n\t	vsubpd		%%ymm9 ,%%ymm7,%%ymm7	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* Multiply normalized, re-permuted transform outputs by forward IBDWT weights: */\
		"movslq		%[__icycleA],%%rdi		\n\t"\
		"vmovaps	(%%rdx,%%rdi),%%ymm4	\n\t"/* [wt0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm5	\n\t"/* [wt0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm6	\n\t"/* [wt0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm7	\n\t"/* [wt0-3]D */\
		"vmulpd		%%ymm4,%%ymm12,%%ymm12						\n\t		vmulpd		%%ymm4,%%ymm13,%%ymm13			\n\t"\
		"vmulpd		%%ymm5,%%ymm2 ,%%ymm2 						\n\t		vmulpd		%%ymm5,%%ymm3 ,%%ymm3 			\n\t"\
		"vmulpd		%%ymm6,%%ymm0 ,%%ymm0 						\n\t		vmulpd		%%ymm6,%%ymm1 ,%%ymm1 			\n\t"\
		"vmulpd		%%ymm7,%%ymm10,%%ymm10						\n\t		vmulpd		%%ymm7,%%ymm11,%%ymm11			\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cmul_offset] "m" (Xcmul_offset)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]	"e" (Xcy_im)	/* Use literal-byte-offset for this ome to save a reg */\
		/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, resp. - assumed << l2_sz_vd on input: */\
		,	[__odd_radix]   "e" (Xodd_radix)\
		,	[__odd_radm2]   "e" (Xodd_radm2)\
		,	[__odd_radm3]   "e" (Xodd_radm3)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Need quartet of ascending [modulo odd_radix] icycle indices for IBDWT weights: */\
		,	[__icycleA]		"m" (XicycleA)\
		,	[__icycleB]		"m" (XicycleB)\
		,	[__icycleC]		"m" (XicycleC)\
		,	[__icycleD]		"m" (XicycleD)\
		/* Need quartet of same-index [i,j,k,l]cycle indices for negacyclic weights and base/baseinv normalizations: */\
		,	[__jcycleA]		"m" (XjcycleA)\
		,	[__kcycleA]		"m" (XkcycleA)\
		,	[__lcycleA]		"m" (XlcycleA)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"   /* Clobbered registers */\
	);\
	}

	/* One-time init step for the low-accuracy (loacc) Fermat-mod carry scheme:
	up-multiplies the 4 quartets of complex negacyclic base-roots stored at
	Xbase_root + [0,2,4,6]*0x20 (Re parts) and + [1,3,5,7]*0x20 (Im parts), in-place,
	by the constant quartet exp(j*I*Pi/2)/RADIX, j = 0-3, stored at +0x100 (Re) / +0x120 (Im).
	Each update is one complex multiply, (x+I*y)*(c+I*s) = (c.x - s.y) + I*(c.y + s.x),
	done 2 quartets at a time via FMA3. Clobbers rax and ymm0-7,10-11 only. */
	#define SSE2_fermat_carry_init_loacc(Xbase_root)\
	{\
	__asm__ volatile (\
		"movq		%[__base_root] ,%%rax	\n\t	"/* Base negacyclic roots at this address +8*0x20 (Re parts), +9*0x20 (Imag parts) */\
		"vmovaps	0x100(%%rax),%%ymm10	\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x120(%%rax),%%ymm11	\n\t	"/* c = Re(exp) in ymm10, s = Im(exp) in ymm11 */\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re parts of 1st,2nd base-root quartets */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im parts */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
	"vfnmadd231pd	%%ymm11,%%ymm5,%%ymm0 	\n\t vfnmadd231pd	%%ymm11,%%ymm7,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
	" vfmadd231pd	%%ymm11,%%ymm4,%%ymm1 	\n\t  vfmadd231pd	%%ymm11,%%ymm6,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t	vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0x20(%%rax)		\n\t	vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"/* Process next 2 base-root quartets: */"\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"/* x = Re parts of 3rd,4th base-root quartets */\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"/* y = Im parts */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
	"vfnmadd231pd	%%ymm11,%%ymm5,%%ymm0 	\n\t vfnmadd231pd	%%ymm11,%%ymm7,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
	" vfmadd231pd	%%ymm11,%%ymm4,%%ymm1 	\n\t  vfmadd231pd	%%ymm11,%%ymm6,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
	:						/* outputs: none */\
	:	[__base_root]	"m" (Xbase_root)	/* All inputs from memory addresses here */\
		: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm10","xmm11"   /* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.
	The array indices i/j/k/lcycle are declared int in the caller but assumed to have been byte-shift-converted by the time this macro is called,
	thus can be used as complex-address-offsets.  Use bytewise literal offsets to save registers for several args here, as vvv-marked:
												                             vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv */
	#define SSE2_fermat_carry_norm_errcheck_X4_loacc(Xdata,Xbase_root,Xcy_re,Xcy_im,Xodd_radix,Xodd_radm2,Xodd_radm3,Xhalf_arr,Xsign_mask,XicycleA,XicycleB,XicycleC,XicycleD, XjcycleA,XkcycleA,XlcycleA, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq		%[__add0],%%rcx		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"addq		$%c[__odd_radix],%%rdx	\n\t"/* wt|wtinv|base|baseinv data offset by icycle array slots from resp. base addresses */\
		/* Multiply complex transform outputs [x,y] = [re,im] by inverse IBDWT weights, which include the 2/n scale factor: */\
		"movslq		%[__icycleA],%%r15		\n\t"\
		"movslq		%[__icycleB],%%r9 		\n\t"\
		"movslq		%[__icycleC],%%r8 		\n\t"\
		"movslq		%[__icycleD],%%r10		\n\t"\
		"vmovaps	(%%rdx,%%r15),%%ymm10	\n\t"/* [wtinv0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm11	\n\t"/* [wtinv0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm12	\n\t"/* [wtinv0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm13	\n\t"/* [wtinv0-3]D */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
		"vmovaps	     (%%rax),%%ymm4						\n\t		vmovaps	0x020(%%rax),%%ymm5							\n\t"\
		"vmovaps	0x040(%%rax),%%ymm2						\n\t		vmovaps	0x060(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm10,%%ymm5,%%ymm5					\n\t"\
		"vmulpd		%%ymm11,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm11,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm4,%%ymm6				\n\t		vshufpd	$15,%%ymm3,%%ymm5,%%ymm7					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm4,%%ymm4				\n\t		vshufpd	$0 ,%%ymm3,%%ymm5,%%ymm5					\n\t"\
		"vmovaps	0x080(%%rax),%%ymm8						\n\t		vmovaps	0x0a0(%%rax),%%ymm9							\n\t"\
		"vmovaps	0x0c0(%%rax),%%ymm2						\n\t		vmovaps	0x0e0(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8					\n\t		vmulpd		%%ymm12,%%ymm9,%%ymm9					\n\t"\
		"vmulpd		%%ymm13,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm13,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm8,%%ymm0				\n\t		vshufpd	$15,%%ymm3,%%ymm9,%%ymm1					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm8,%%ymm8				\n\t		vshufpd	$0 ,%%ymm3,%%ymm9,%%ymm9					\n\t"\
		"vperm2f128 $32,%%ymm0,%%ymm6,%%ymm2	/* Re B	*/	\n\t		vperm2f128 $32,%%ymm1,%%ymm7,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm0,%%ymm6,%%ymm6	/* Re D	*/	\n\t		vperm2f128 $49,%%ymm1,%%ymm7,%%ymm7		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm8,%%ymm4,%%ymm0	/* Re A	*/	\n\t		vperm2f128 $32,%%ymm9,%%ymm5,%%ymm1 	/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm8,%%ymm4,%%ymm4	/* Re C	*/	\n\t		vperm2f128 $49,%%ymm9,%%ymm5,%%ymm5 	/* Im C	*/	\n\t"\
		"subq		$%c[__odd_radix],%%rdx				\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
		"addq		%%r15,%%rdx				\n\t"/* icycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	    (%%rax),%%ymm10		\n\t"/* c = Re part of 1st base-root quartet */\
		"vmovaps	0x20(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%rcx)	\n\t"\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%ymm11,%%ymm9,%%ymm0 	\n\t"/* wt_im*[y copy] ...[a0-3.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%ymm11,%%ymm8,%%ymm1 	\n\t"/* wt_im*[x copy] ...[a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm0	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm1	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm13	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
	"vfnmadd231pd	%%ymm13,%%ymm8,%%ymm0 	\n\t vfnmadd231pd	%%ymm13,%%ymm9,%%ymm1	\n\t"/* base[0]*[cy0-3.re|im] ... xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm0 	\n\t"/* wt_im*[y copy] ... [a0-3.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm1 	\n\t"/* wt_im*[x copy] ... [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm12	\n\t"/*  ymm8  = s.x ... ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm10	\n\t"/*  ymm9  = s.y ... ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vmovaps	%%ymm12,0x20(%%rax)		\n\t"/* Im part */\
		"vmovaps	%%ymm10,    (%%rax)		\n\t"/* Store result, overwriting the old base root */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__jcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* jcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%ymm11,%%ymm9,%%ymm2 	\n\t"/* wt_im*[y copy] ...[a0-3.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%ymm11,%%ymm8,%%ymm3 	\n\t"/* wt_im*[x copy] ...[a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm2	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm13	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
	"vfnmadd231pd	%%ymm13,%%ymm8,%%ymm2 	\n\t vfnmadd231pd	%%ymm13,%%ymm9,%%ymm3	\n\t"/* base[0]*[cy0-3.re|im] ... xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm2 	\n\t"/* wt_im*[y copy] ... [a0-3.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm3 	\n\t"/* wt_im*[x copy] ... [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm12	\n\t"/*  ymm8  = s.x ... ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm10	\n\t"/*  ymm9  = s.y ... ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vmovaps	%%ymm12,0x60(%%rax)		\n\t"/* Im part */\
		"vmovaps	%%ymm10,0x40(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__kcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* kcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%ymm11,%%ymm9,%%ymm4 	\n\t"/* wt_im*[y copy] ...[a0-3.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%ymm11,%%ymm8,%%ymm5 	\n\t"/* wt_im*[x copy] ...[a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm4	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm13	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
	"vfnmadd231pd	%%ymm13,%%ymm8,%%ymm4 	\n\t vfnmadd231pd	%%ymm13,%%ymm9,%%ymm5	\n\t"/* base[0]*[cy0-3.re|im] ... xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm4 	\n\t"/* wt_im*[y copy] ... [a0-3.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm5 	\n\t"/* wt_im*[x copy] ... [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm12	\n\t"/*  ymm8  = s.x ... ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm10	\n\t"/*  ymm9  = s.y ... ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vmovaps	%%ymm12,0xa0(%%rax)		\n\t"/* Im part */\
		"vmovaps	%%ymm10,0x80(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__lcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* lcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
	" vfmadd231pd	%%ymm11,%%ymm9,%%ymm6 	\n\t"/* wt_im*[y copy] ...[a0-3.re] = x*wt_re + y*wt_im */\
	"vfnmadd231pd	%%ymm11,%%ymm8,%%ymm7 	\n\t"/* wt_im*[x copy] ...[a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
	"vfmadd213pd (%%rbx),%%ymm15,%%ymm6	\n\t vfmadd213pd %c[__cy_im](%%rbx),%%ymm15,%%ymm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm13	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
	"vfnmadd231pd	%%ymm13,%%ymm8,%%ymm6 	\n\t vfnmadd231pd	%%ymm13,%%ymm9,%%ymm7	\n\t"/* base[0]*[cy0-3.re|im] ... xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm6 	\n\t"/* wt_im*[y copy] ... [a0-3.re] = x*wt_re - y*wt_im */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm7 	\n\t"/* wt_im*[x copy] ... [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
	" vfmadd231pd	%%ymm11,%%ymm8,%%ymm12	\n\t"/*  ymm8  = s.x ... ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
	"vfnmadd231pd	%%ymm11,%%ymm9,%%ymm10	\n\t"/*  ymm9  = s.y ... ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vmovaps	%%ymm12,0xe0(%%rax)		\n\t"/* Im part */\
		"vmovaps	%%ymm10,0xc0(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* Multiply normalized, re-permuted transform outputs by forward IBDWT weights: */\
		"movslq		%[__icycleA],%%r15		\n\t"\
		"vmovaps	(%%rdx,%%r15),%%ymm4	\n\t"/* [wt0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm5	\n\t"/* [wt0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm6	\n\t"/* [wt0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm7	\n\t"/* [wt0-3]D */\
		"vmulpd		%%ymm4,%%ymm12,%%ymm12						\n\t		vmulpd		%%ymm4,%%ymm13,%%ymm13			\n\t"\
		"vmulpd		%%ymm5,%%ymm2 ,%%ymm2 						\n\t		vmulpd		%%ymm5,%%ymm3 ,%%ymm3 			\n\t"\
		"vmulpd		%%ymm6,%%ymm0 ,%%ymm0 						\n\t		vmulpd		%%ymm6,%%ymm1 ,%%ymm1 			\n\t"\
		"vmulpd		%%ymm7,%%ymm10,%%ymm10						\n\t		vmulpd		%%ymm7,%%ymm11,%%ymm11			\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"e" (Xcy_im)	/* Use literal-byte-offset for this one to save a reg */\
		/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, resp. - assumed << l2_sz_vd on input: */\
		,	[__odd_radix]   "e" (Xodd_radix)\
		,	[__odd_radm2]   "e" (Xodd_radm2)\
		,	[__odd_radm3]   "e" (Xodd_radm3)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Need quartet of ascending [modulo odd_radix] icycle indices for IBDWT weights: */\
		,	[__icycleA]		"m" (XicycleA)\
		,	[__icycleB]		"m" (XicycleB)\
		,	[__icycleC]		"m" (XicycleC)\
		,	[__icycleD]		"m" (XicycleD)\
		/* Need quartet of same-index [i,j,k,l]cycle indices for negacyclic weights and base/baseinv normalizations: */\
		,	[__jcycleA]		"m" (XjcycleA)\
		,	[__kcycleA]		"m" (XkcycleA)\
		,	[__lcycleA]		"m" (XlcycleA)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r8","r9","r10","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

  #else	// USE_AVX, no AVX2/FMA3 support:

	/* Power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.

	NOTE: The array indices i/j/k/lcycle declared int in caller but assumed to have been
	byte-shift-converted at time this macro called, thus can use as complex-address-offsets.
	*/
	#define SSE2_fermat_carry_norm_pow2_errcheck_X4(Xdata,Xbase_root,Xcmul_offset,Xcy_re,Xcy_im,Xhalf_arr,Xsign_mask, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__add0],%%r14		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root] ,%%rax			\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movslq	%[__cmul_offset],%%rbx			\n\t"\
		"addq	%%rax,%%rbx	\n\t"/* Index into complex const multipliers block, each applied to 4 sets of base roots */\
		/* Up-multiply quartet of negacyclic roots used in this macro invocation; store sets 2-4 back into mem, keep set 1 in ymm10,11 [that's why we do sets 1/2 after 3/4] */\
		"vmovaps	    (%%rbx),%%ymm10		\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x20(%%rbx),%%ymm11		\n\t	"/* c = Re(exp) in ymm0, s = Im(exp) in ymm1 */\
		/* Sets 3/4: */\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
		/* Sets 1/2: */\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im part */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"/* s.y */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"/* s.x */\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm10	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm11	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"											vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"											vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	0x40(%%rdx),%%ymm12	\n\t"/* xmm12 = scale */\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
	/* Apply inverse-complex-runlength scaling factor to the data: */\
		"vmulpd		%%ymm12,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm12,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm12,%%ymm6,%%ymm6					\n\t		vmulpd		%%ymm12,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm12,%%ymm0,%%ymm0					\n\t		vmulpd		%%ymm12,%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm12,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm12,%%ymm5,%%ymm5	\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
	/*	"vmovaps	-0x20(%%rdx),%%ymm15	\n\t"// rnd_const; prefer ROUNDPD in AVX mode, so ymm15 free */\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
		"movq		%[__cy_im],%%rcx		\n\t"\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%r14)	\n\t"\
		/* For a-quartet, needed negacyclic root already in ymm10/11: */\
		/* Data in ymm0,ymm1 */\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm15,%%ymm1,%%ymm1	\n\t"\
		"vaddpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vaddpd		(%%rcx),%%ymm1,%%ymm1	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm9 ,%%ymm1,%%ymm1	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm2,ymm3 */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm2,%%ymm2	\n\t	vmulpd		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vaddpd		(%%rbx),%%ymm2,%%ymm2	\n\t	vaddpd		(%%rcx),%%ymm3,%%ymm3	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm2,%%ymm2	\n\t	vsubpd		%%ymm9 ,%%ymm3,%%ymm3	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm4,ymm5 */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm15,%%ymm5,%%ymm5	\n\t"\
		"vaddpd		(%%rbx),%%ymm4,%%ymm4	\n\t	vaddpd		(%%rcx),%%ymm5,%%ymm5	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm9 ,%%ymm5,%%ymm5	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm6,ymm7 */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	0x20(%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm6,%%ymm6	\n\t	vmulpd		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vaddpd		(%%rbx),%%ymm6,%%ymm6	\n\t	vaddpd		(%%rcx),%%ymm7,%%ymm7	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	(%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,(%%rcx)			\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm6,%%ymm6	\n\t	vsubpd		%%ymm9 ,%%ymm7,%%ymm7	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cmul_offset] "m" (Xcmul_offset)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"m" (Xcy_im)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.

	Key differences vs pow2 version:
	- Use odd_radix as index offset into local storage for IBDWT weights and variable base/baseinv terms;
	- Apply inv/fwd IBDWT weights bookending the negacyclic weights;
	- Value of base/baseinv to be applied to output taken from odd_radix-length array, using same index as for selecting IBDWT weight.

	The array indices i/j/k/lcycle declared int in caller but assumed to have been byte-shift-converted at time this macro called,
	thus can use as complex-address-offsets.  Use bytewise literal offsets to save registers for several args here, as vvv-marked:
												                                           vvvvv The [1,2,3]-multiples of odd_radix assumed << l2_sz_vd on input */
	#define SSE2_fermat_carry_norm_errcheck_X4_hiacc(Xdata,Xbase_root,Xcmul_offset,Xcy_re,Xcy_im,Xodd_radix,Xodd_radm2,Xodd_radm3,Xhalf_arr,Xsign_mask,XicycleA,XicycleB,XicycleC,XicycleD, XjcycleA,XkcycleA,XlcycleA, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq		%[__add0],%%rcx		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__base_root]  ,%%rax			\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		"movslq	%[__cmul_offset],%%rbx			\n\t"\
		"addq	%%rax,%%rbx	\n\t"/* Index into complex const multipliers block, each applied to 4 sets of base roots */\
		/* Up-multiply quartet of negacyclic roots used in this macro invocation; store sets 2-4 back into mem, keep set 1 in ymm10,11 [that's why we do sets 1/2 after 3/4] */\
		"vmovaps	    (%%rbx),%%ymm10		\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x20(%%rbx),%%ymm11		\n\t	"/* c = Re(exp) in ymm0, s = Im(exp) in ymm1 */\
		"/* Sets 3/4: */"\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
		"/* Sets 1/2: */"\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im part */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"/* s.y */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"/* s.x */\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm10	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm11	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"											vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"											vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"addq		$%c[__odd_radix],%%rdx				\n\t"/* wt|wtinv|base|baseinv data offset by icycle array slots from resp. base addresses */\
		/* Multiply complex transform outputs [x,y] = [re,im] by inverse IBDWT weights, which include the 2/n scale factor: */\
		"movslq		%[__icycleA],%%rdi		\n\t"\
		"movslq		%[__icycleB],%%r9 		\n\t"\
		"movslq		%[__icycleC],%%r8 		\n\t"\
		"movslq		%[__icycleD],%%r10		\n\t"\
		"vmovaps	(%%rdx,%%rdi),%%ymm12	\n\t"/* [wtinv0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm13	\n\t"/* [wtinv0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm14	\n\t"/* [wtinv0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm15	\n\t"/* [wtinv0-3]D */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
		"vmovaps	     (%%rax),%%ymm4						\n\t		vmovaps	0x020(%%rax),%%ymm5							\n\t"\
		"vmovaps	0x040(%%rax),%%ymm2						\n\t		vmovaps	0x060(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm12,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm12,%%ymm5,%%ymm5					\n\t"\
		"vmulpd		%%ymm13,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm13,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm4,%%ymm6				\n\t		vshufpd	$15,%%ymm3,%%ymm5,%%ymm7					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm4,%%ymm4				\n\t		vshufpd	$0 ,%%ymm3,%%ymm5,%%ymm5					\n\t"\
		"vmovaps	0x080(%%rax),%%ymm8						\n\t		vmovaps	0x0a0(%%rax),%%ymm9							\n\t"\
		"vmovaps	0x0c0(%%rax),%%ymm2						\n\t		vmovaps	0x0e0(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm14,%%ymm8,%%ymm8					\n\t		vmulpd		%%ymm14,%%ymm9,%%ymm9					\n\t"\
		"vmulpd		%%ymm15,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm15,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm8,%%ymm0				\n\t		vshufpd	$15,%%ymm3,%%ymm9,%%ymm1					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm8,%%ymm8				\n\t		vshufpd	$0 ,%%ymm3,%%ymm9,%%ymm9					\n\t"\
		"vperm2f128 $32,%%ymm0,%%ymm6,%%ymm2	/* Re B	*/	\n\t		vperm2f128 $32,%%ymm1,%%ymm7,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm0,%%ymm6,%%ymm6	/* Re D	*/	\n\t		vperm2f128 $49,%%ymm1,%%ymm7,%%ymm7		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm8,%%ymm4,%%ymm0	/* Re A	*/	\n\t		vperm2f128 $32,%%ymm9,%%ymm5,%%ymm1 	/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm8,%%ymm4,%%ymm4	/* Re C	*/	\n\t		vperm2f128 $49,%%ymm9,%%ymm5,%%ymm5 	/* Im C	*/	\n\t"\
		"subq		$%c[__odd_radix],%%rdx				\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
		"addq		%%rdi,%%rdx				\n\t"/* icycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%rcx)	\n\t"\
		/* For a-quartet, needed negacyclic root already in ymm10/11: */\
		/* Data in ymm0,ymm1 */\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm15,%%ymm1,%%ymm1	\n\t"\
		"vaddpd		(%%rbx),%%ymm0,%%ymm0	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm1,%%ymm1\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm9 ,%%ymm1,%%ymm1	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__jcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* jcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm2,ymm3 */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm2,%%ymm2	\n\t	vmulpd		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vaddpd		(%%rbx),%%ymm2,%%ymm2	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm3,%%ymm3\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm2,%%ymm2	\n\t	vsubpd		%%ymm9 ,%%ymm3,%%ymm3	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__kcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* kcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm4,ymm5 */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm15,%%ymm5,%%ymm5	\n\t"\
		"vaddpd		(%%rbx),%%ymm4,%%ymm4	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm5,%%ymm5\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm9 ,%%ymm5,%%ymm5	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__lcycleA],%%rdi		\n\t"\
		"addq		%%rdi,%%rdx				\n\t"/* lcycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
		/* Data in ymm6,ymm7 */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm6,%%ymm6	\n\t	vmulpd		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vaddpd		(%%rbx),%%ymm6,%%ymm6	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm7,%%ymm7\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm6,%%ymm6	\n\t	vsubpd		%%ymm9 ,%%ymm7,%%ymm7	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* Multiply normalized, re-permuted transform outputs by forward IBDWT weights: */\
		"movslq		%[__icycleA],%%rdi		\n\t"\
		"vmovaps	(%%rdx,%%rdi),%%ymm4	\n\t"/* [wt0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm5	\n\t"/* [wt0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm6	\n\t"/* [wt0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm7	\n\t"/* [wt0-3]D */\
		"vmulpd		%%ymm4,%%ymm12,%%ymm12						\n\t		vmulpd		%%ymm4,%%ymm13,%%ymm13			\n\t"\
		"vmulpd		%%ymm5,%%ymm2 ,%%ymm2 						\n\t		vmulpd		%%ymm5,%%ymm3 ,%%ymm3 			\n\t"\
		"vmulpd		%%ymm6,%%ymm0 ,%%ymm0 						\n\t		vmulpd		%%ymm6,%%ymm1 ,%%ymm1 			\n\t"\
		"vmulpd		%%ymm7,%%ymm10,%%ymm10						\n\t		vmulpd		%%ymm7,%%ymm11,%%ymm11			\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cmul_offset] "m" (Xcmul_offset)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]	"e" (Xcy_im)	/* Use literal-byte-offset for this ome to save a reg */\
		/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, resp. - assumed << l2_sz_vd on input: */\
		,	[__odd_radix]   "e" (Xodd_radix)\
		,	[__odd_radm2]   "e" (Xodd_radm2)\
		,	[__odd_radm3]   "e" (Xodd_radm3)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Need quartet of ascending [modulo odd_radix] icycle indices for IBDWT weights: */\
		,	[__icycleA]		"m" (XicycleA)\
		,	[__icycleB]		"m" (XicycleB)\
		,	[__icycleC]		"m" (XicycleC)\
		,	[__icycleD]		"m" (XicycleD)\
		/* Need quartet of same-index [i,j,k,l]cycle indices for negacyclic weights and base/baseinv normalizations: */\
		,	[__jcycleA]		"m" (XjcycleA)\
		,	[__kcycleA]		"m" (XkcycleA)\
		,	[__lcycleA]		"m" (XlcycleA)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"   /* Clobbered registers */\
	);\
	}

	#define SSE2_fermat_carry_init_loacc(Xbase_root)\
	{\
	__asm__ volatile (\
		"movq		%[__base_root] ,%%rax	\n\t	"/* Base negacyclic roots at this address +8*0x20 (Re parts), +9*0x20 (Imag parts) */\
		"vmovaps	0x100(%%rax),%%ymm10	\n\t	"/* Multiply by exp(j*I*Pi/2)/RADIX, for j = 0-3 */\
		"vmovaps	0x120(%%rax),%%ymm11	\n\t	"/* c = Re(exp) in ymm0, s = Im(exp) in ymm1 */\
		"vmovaps	     (%%rax),%%ymm0		\n\t	vmovaps	 0x40(%%rax),%%ymm2			\n\t"/* x = Re part of 1st base-root quartet */\
		"vmovaps	 0x20(%%rax),%%ymm1		\n\t	vmovaps	 0x60(%%rax),%%ymm3			\n\t"/* y = Im part */\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"/* Copy x */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"/* Copy y */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* c.x */\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"/* s.y */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* c.y */\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"/* s.x */\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"/* Out.re = c.x - s.y */\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"/* Out.im = c.y + s.x */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t	vmovaps		%%ymm2 ,0x40(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0x20(%%rax)		\n\t	vmovaps		%%ymm3 ,0x60(%%rax)		\n\t"/* Im part */\
		"/* Process next 2 base-root quartets: */"\
		"vmovaps	 0x80(%%rax),%%ymm0		\n\t	vmovaps	 0xc0(%%rax),%%ymm2			\n\t"\
		"vmovaps	 0xa0(%%rax),%%ymm1		\n\t	vmovaps	 0xe0(%%rax),%%ymm3			\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps		%%ymm2,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps		%%ymm3,%%ymm7			\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm11,%%ymm5,%%ymm5	\n\t	vmulpd		%%ymm11,%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t	vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm11,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm11,%%ymm6,%%ymm6	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm7 ,%%ymm2,%%ymm2	\n\t"\
		"vaddpd		%%ymm4 ,%%ymm1,%%ymm1	\n\t	vaddpd		%%ymm6 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t	vmovaps		%%ymm2 ,0xc0(%%rax)		\n\t"/* Store result, overwriting input base root */\
		"vmovaps	%%ymm1 ,0xa0(%%rax)		\n\t	vmovaps		%%ymm3 ,0xe0(%%rax)		\n\t"/* Im part */\
	:						/* outputs: none */\
	:	[__base_root]	"m" (Xbase_root)	/* All inputs from memory addresses here */\
		: "cc","memory","rax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm10","xmm11"   /* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.
	The array indices i/j/k/lcycle declared int in caller but assumed to have been byte-shift-converted at time this macro called,
	thus can use as complex-address-offsets.  Use bytewise literal offsets to save registers for several args here, as vvv-marked:
												                             vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv */
	#define SSE2_fermat_carry_norm_errcheck_X4_loacc(Xdata,Xbase_root,Xcy_re,Xcy_im,Xodd_radix,Xodd_radm2,Xodd_radm3,Xhalf_arr,Xsign_mask,XicycleA,XicycleB,XicycleC,XicycleD, XjcycleA,XkcycleA,XlcycleA, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%ymm15	\n\t"/* prp_mult, broadcast to all double-slots of ymm15 */\
		"movq		%[__add0],%%rcx		\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"addq		$%c[__odd_radix],%%rdx				\n\t"/* wt|wtinv|base|baseinv data offset by icycle array slots from resp. base addresses */\
		/* Multiply complex transform outputs [x,y] = [re,im] by inverse IBDWT weights, which include the 2/n scale factor: */\
		"movslq		%[__icycleA],%%r15		\n\t"\
		"movslq		%[__icycleB],%%r9 		\n\t"\
		"movslq		%[__icycleC],%%r8 		\n\t"\
		"movslq		%[__icycleD],%%r10		\n\t"\
		"vmovaps	(%%rdx,%%r15),%%ymm10	\n\t"/* [wtinv0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm11	\n\t"/* [wtinv0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm12	\n\t"/* [wtinv0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm13	\n\t"/* [wtinv0-3]D */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
		"vmovaps	     (%%rax),%%ymm4						\n\t		vmovaps	0x020(%%rax),%%ymm5							\n\t"\
		"vmovaps	0x040(%%rax),%%ymm2						\n\t		vmovaps	0x060(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4					\n\t		vmulpd		%%ymm10,%%ymm5,%%ymm5					\n\t"\
		"vmulpd		%%ymm11,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm11,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm4,%%ymm6				\n\t		vshufpd	$15,%%ymm3,%%ymm5,%%ymm7					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm4,%%ymm4				\n\t		vshufpd	$0 ,%%ymm3,%%ymm5,%%ymm5					\n\t"\
		"vmovaps	0x080(%%rax),%%ymm8						\n\t		vmovaps	0x0a0(%%rax),%%ymm9							\n\t"\
		"vmovaps	0x0c0(%%rax),%%ymm2						\n\t		vmovaps	0x0e0(%%rax),%%ymm3							\n\t"\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8					\n\t		vmulpd		%%ymm12,%%ymm9,%%ymm9					\n\t"\
		"vmulpd		%%ymm13,%%ymm2,%%ymm2					\n\t		vmulpd		%%ymm13,%%ymm3,%%ymm3					\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm8,%%ymm0				\n\t		vshufpd	$15,%%ymm3,%%ymm9,%%ymm1					\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm8,%%ymm8				\n\t		vshufpd	$0 ,%%ymm3,%%ymm9,%%ymm9					\n\t"\
		"vperm2f128 $32,%%ymm0,%%ymm6,%%ymm2	/* Re B	*/	\n\t		vperm2f128 $32,%%ymm1,%%ymm7,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm0,%%ymm6,%%ymm6	/* Re D	*/	\n\t		vperm2f128 $49,%%ymm1,%%ymm7,%%ymm7		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm8,%%ymm4,%%ymm0	/* Re A	*/	\n\t		vperm2f128 $32,%%ymm9,%%ymm5,%%ymm1 	/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm8,%%ymm4,%%ymm4	/* Re C	*/	\n\t		vperm2f128 $49,%%ymm9,%%ymm5,%%ymm5 	/* Im C	*/	\n\t"\
		"subq		$%c[__odd_radix],%%rdx				\n\t"\
		/* Base negacyclic roots at this address in [0,2,4,6]*0x20 (Re parts), [1,3,5,7]*0x20 (Imag parts) */\
		"movq		%[__sign_mask],%%rsi	\n\t"\
		"movq	%[__base_root] ,%%rax		\n\t"/* Won't need main-array again until output transpose, so re-use rax for base_root */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"vmovaps	-0x40(%%rdx),%%ymm13	\n\t"/* xmm13 = maxerr */\
		"addq		%%r15,%%rdx				\n\t"/* icycle assumed already in left-shifted ptr-byte-offset form */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	    (%%rax),%%ymm10		\n\t"/* c = Re part of 1st base-root quartet */\
		"vmovaps	0x20(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Do a-quartet: Data in ymm0,ymm1 */\
	"prefetcht0	(%%rcx)	\n\t"\
		"vmovaps	%%ymm13,%%ymm14			\n\t"/* maxerr copy */\
		"movq		%[__cy_re],%%rbx		\n\t"\
		"vmovaps	%%ymm0,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm1,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"vmovaps	%%ymm0,%%ymm8			\n\t	vmovaps		%%ymm1,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm0,%%ymm0		\n\t	vroundpd	$0,%%ymm1,%%ymm1	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm15,%%ymm1,%%ymm1	\n\t"\
		"vaddpd		(%%rbx),%%ymm0,%%ymm0	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm1,%%ymm1\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm9 ,%%ymm1,%%ymm1	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm0 ,%%ymm8			\n\t	vmovaps		%%ymm1 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm0,%%ymm0	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm1,%%ymm1	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
		"vmulpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ymm8  = s.x */\
		"vmulpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"/* ymm9  = s.y */\
		"vsubpd		%%ymm9 ,%%ymm10,%%ymm10	\n\t"/* ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vaddpd		%%ymm8 ,%%ymm12,%%ymm11	\n\t"/* ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
		"vmovaps	%%ymm10,    (%%rax)		\n\t"/* Store result, overwriting the old base root */\
		"vmovaps	%%ymm11,0x20(%%rax)		\n\t"/* Im part */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x40(%%rax),%%ymm10		\n\t"/* c = Re part of 2nd base-root quartet */\
		"vmovaps	0x60(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do b-quartet: Data in ymm2,ymm3 */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__jcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* jcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm2,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm3,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm2,%%ymm8			\n\t	vmovaps		%%ymm3,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm2,%%ymm2		\n\t	vroundpd	$0,%%ymm3,%%ymm3	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm2 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm3 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm2,%%ymm2	\n\t	vmulpd		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vaddpd		(%%rbx),%%ymm2,%%ymm2	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm3,%%ymm3\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm2,%%ymm2	\n\t	vsubpd		%%ymm9 ,%%ymm3,%%ymm3	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm2 ,%%ymm8			\n\t	vmovaps		%%ymm3 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm2,%%ymm2	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm3,%%ymm3	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
		"vmulpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ymm8  = s.x */\
		"vmulpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"/* ymm9  = s.y */\
		"vsubpd		%%ymm9 ,%%ymm10,%%ymm10	\n\t"/* ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vaddpd		%%ymm8 ,%%ymm12,%%ymm11	\n\t"/* ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
		"vmovaps	%%ymm10,0x40(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		"vmovaps	%%ymm11,0x60(%%rax)		\n\t"/* Im part */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0x80(%%rax),%%ymm10		\n\t"/* c = Re part of 3rd base-root quartet */\
		"vmovaps	0xa0(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do c-quartet: Data in ymm4,ymm5 */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__kcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* kcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm4,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm5,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm4,%%ymm8			\n\t	vmovaps		%%ymm5,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm4,%%ymm4		\n\t	vroundpd	$0,%%ymm5,%%ymm5	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm4 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm5 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm4,%%ymm4	\n\t	vmulpd		%%ymm15,%%ymm5,%%ymm5	\n\t"\
		"vaddpd		(%%rbx),%%ymm4,%%ymm4	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm5,%%ymm5\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm9 ,%%ymm5,%%ymm5	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm4 ,%%ymm8			\n\t	vmovaps		%%ymm5 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm4,%%ymm4	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm5,%%ymm5	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
		"vmulpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ymm8  = s.x */\
		"vmulpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"/* ymm9  = s.y */\
		"vsubpd		%%ymm9 ,%%ymm10,%%ymm10	\n\t"/* ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vaddpd		%%ymm8 ,%%ymm12,%%ymm11	\n\t"/* ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
		"vmovaps	%%ymm10,0x80(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		"vmovaps	%%ymm11,0xa0(%%rax)		\n\t"/* Im part */\
		/* Get next set of negacyclic roots: */\
		"vmovaps	0xc0(%%rax),%%ymm10		\n\t"/* c = Re part of 4th base-root quartet */\
		"vmovaps	0xe0(%%rax),%%ymm11		\n\t"/* s = Im part */\
	/* Now do d-quartet: Data in ymm6,ymm7 */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%rcx,%%r15,8)	\n\t"\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,%%ymm13			\n\t"/* maxerr copy */\
		"movslq		%[__lcycleA],%%r15		\n\t"\
		"addq		%%r15,%%rdx				\n\t"/* lcycle assumed already in left-shifted ptr-byte-offset form */\
		"vmovaps	%%ymm6,%%ymm8			\n\t"/* x copy */\
		"vmovaps	%%ymm7,%%ymm9			\n\t"/* y copy */\
		/* Inverse negacyclic weight is (wt_re, -wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vaddpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re + y*wt_im */\
		"vsubpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re - x*wt_im */\
		/* normalize, compute carryout, compute ROE: */\
		"vmovaps	%%ymm6,%%ymm8			\n\t	vmovaps		%%ymm7,%%ymm9		\n\t"/* copy x|y */\
		"vroundpd	$0,%%ymm6,%%ymm6		\n\t	vroundpd	$0,%%ymm7,%%ymm7	\n\t"/* temp = DNINT(x|y) */\
		"vmovaps	%c[__odd_radm3](%%rdx),%%ymm12	\n\t"/* [baseinv0-3] */\
		"vsubpd		%%ymm6 ,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm7 ,%%ymm9 ,%%ymm9 	\n\t"/* frac = [x - temp] */\
		"vandpd		(%%rsi),%%ymm8 ,%%ymm8 	\n\t	vandpd		(%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* frac = fabs(frac) */\
		"vmulpd		%%ymm15,%%ymm6,%%ymm6	\n\t	vmulpd		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vaddpd		(%%rbx),%%ymm6,%%ymm6	\n\t vaddpd	%c[__cy_im](%%rbx),%%ymm7,%%ymm7\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm13,%%ymm8 ,%%ymm13	\n\t	vmaxpd		%%ymm14,%%ymm9 ,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy temp */\
		"vmulpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* temp*baseinv[0] ... inline the remaining +odd_radix offset in addressing */\
		"vmaxpd		%%ymm13,%%ymm14,%%ymm14	\n\t"/* merge re|im maxerr vectors */\
		"vroundpd	$0,%%ymm8 ,%%ymm8 		\n\t	vroundpd	$0,%%ymm9,%%ymm9		\n\t"/* [cy0-3.re] = DNINT(temp*baseinv[0]) */\
		"vmovaps	%c[__odd_radm2](%%rdx),%%ymm12	\n\t"/* [base0-3] */\
		"vmovaps	%%ymm8,(%%rbx)			\n\t	vmovaps		%%ymm9,%c[__cy_im](%%rbx)\n\t"/* store [cy0-3.re|im] */\
		"vmulpd		%%ymm12,%%ymm8,%%ymm8	\n\t	vmulpd		%%ymm12,%%ymm9,%%ymm9	\n\t"/* base[0]*[cy0-3.re|im] */\
		"vsubpd		%%ymm8 ,%%ymm6,%%ymm6	\n\t	vsubpd		%%ymm9 ,%%ymm7,%%ymm7	\n\t"/* xmm0|1 = [a0-3.re|im] = temp - [cy0-3.re|im]*base[0] */\
		"vmovaps	%%ymm6 ,%%ymm8			\n\t	vmovaps		%%ymm7 ,%%ymm9			\n\t"/* cpy x|y */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"/* wt_re*[x     ] */\
		"vmulpd		%%ymm11,%%ymm9,%%ymm9	\n\t"/* wt_im*[y copy] */\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"/* wt_re*[y     ] */\
		"vmulpd		%%ymm11,%%ymm8,%%ymm8	\n\t"/* wt_im*[x copy] */\
		"vsubpd		%%ymm9 ,%%ymm6,%%ymm6	\n\t"/* [a0-3.re] = x*wt_re - y*wt_im */\
		"vaddpd		%%ymm8 ,%%ymm7,%%ymm7	\n\t"/* [a0-3.im] = y*wt_re + x*wt_im */\
		/* Up-multiply negacyclic roots stored in ymm10,11 by exp(j*I*Pi/2)/RADIX, for j = 4 */\
		"vmovaps	0x140(%%rax),%%ymm8 	\n\t"/* x = Re(exp) in ymm10 */\
		"vmovaps	0x160(%%rax),%%ymm9 	\n\t"/* y = Im(exp) in ymm11 */\
		"vmulpd		%%ymm10,%%ymm9 ,%%ymm12	\n\t"/* ymm12 = c.y */\
		"vmulpd		%%ymm10,%%ymm8 ,%%ymm10	\n\t"/* ymm10 = c.x */\
		"vmulpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ymm8  = s.x */\
		"vmulpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"/* ymm9  = s.y */\
		"vsubpd		%%ymm9 ,%%ymm10,%%ymm10	\n\t"/* ymm10 = wt.re = c.x - s.y; ymm9  free */\
		"vaddpd		%%ymm8 ,%%ymm12,%%ymm11	\n\t"/* ymm11 = wt.im = s.x + c.y; ymm8 ,4 free */\
		"vmovaps	%%ymm10,0xc0(%%rax)		\n\t"/* Store result, overwriting the old base root */\
		"vmovaps	%%ymm11,0xe0(%%rax)		\n\t"/* Im part */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm14,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* Multiply normalized, re-permuted transform outputs by forward IBDWT weights: */\
		"movslq		%[__icycleA],%%r15		\n\t"\
		"vmovaps	(%%rdx,%%r15),%%ymm4	\n\t"/* [wt0-3]A */\
		"vmovaps	(%%rdx,%%r9 ),%%ymm5	\n\t"/* [wt0-3]B */\
		"vmovaps	(%%rdx,%%r8 ),%%ymm6	\n\t"/* [wt0-3]C */\
		"vmovaps	(%%rdx,%%r10),%%ymm7	\n\t"/* [wt0-3]D */\
		"vmulpd		%%ymm4,%%ymm12,%%ymm12						\n\t		vmulpd		%%ymm4,%%ymm13,%%ymm13			\n\t"\
		"vmulpd		%%ymm5,%%ymm2 ,%%ymm2 						\n\t		vmulpd		%%ymm5,%%ymm3 ,%%ymm3 			\n\t"\
		"vmulpd		%%ymm6,%%ymm0 ,%%ymm0 						\n\t		vmulpd		%%ymm6,%%ymm1 ,%%ymm1 			\n\t"\
		"vmulpd		%%ymm7,%%ymm10,%%ymm10						\n\t		vmulpd		%%ymm7,%%ymm11,%%ymm11			\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__base_root]	"m" (Xbase_root)\
		,	[__cy_re]		"m" (Xcy_re)\
		,	[__cy_im]		"e" (Xcy_im)	/* Use literal-byte-offset for this ome to save a reg */\
		/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, resp. - assumed << l2_sz_vd on input: */\
		,	[__odd_radix]   "e" (Xodd_radix)\
		,	[__odd_radm2]   "e" (Xodd_radm2)\
		,	[__odd_radm3]   "e" (Xodd_radm3)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		/* Need quartet of ascending [modulo odd_radix] icycle indices for IBDWT weights: */\
		,	[__icycleA]		"m" (XicycleA)\
		,	[__icycleB]		"m" (XicycleB)\
		,	[__icycleC]		"m" (XicycleC)\
		,	[__icycleD]		"m" (XicycleD)\
		/* Need quartet of same-index [i,j,k,l]cycle indices for negacyclic weights and base/baseinv normalizations: */\
		,	[__jcycleA]		"m" (XjcycleA)\
		,	[__kcycleA]		"m" (XkcycleA)\
		,	[__lcycleA]		"m" (XlcycleA)\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		,	[__p2] "m" (Xp2)\
		,	[__p3] "m" (Xp3)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r8","r9","r10","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

  #endif	// AVX2/FMA3?

#else	// SSE2

	/* Power-of-2-runlength Fermat-mod acyclic-transform carry macro. (No IBDWT needed for power-of-2 runlengths.)
	*/
	#define SSE2_fermat_carry_norm_pow2_errcheck(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xhalf_arr,Xsign_mask,Xadd1,Xadd2, Xadd0, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd	(%%rax),%%xmm15		\n\t	shufpd	$0,%%xmm15,%%xmm15	\n\t"/* prp_mult, broadcast to all double-slots of xmm15 */\
	"movq	%[__add0],%%r14	\n\t"\
	"prefetcht0	(%%r14)		\n\t"\
		"movslq	%[__idx_offset],%%rsi	\n\t"/* rsi stores [j + idx_offset], idx_offset starts = 0, gets incremented by idx_incr each macro invocation */\
		"movslq		%[__nrt_bits],%%rcx	\n\t"\
		"movslq		%[__nrtm1],%%rdi	\n\t"\
		"movq		%%rsi,%%rax			\n\t"/* j + idx_offset */\
		"shrq		$1,%%rax			\n\t"/* l = ((j + idx_offset) >> 1) */\
		"movq		%%rax,%%rbx			\n\t"\
		"andq		%%rdi,%%rax			\n\t"/* k1 = (l & __NRTM1) */\
		"shrq		%%cl,%%rbx			\n\t"/* k2=(l >> __NRT_BITS) */\
		"shlq		$4,%%rax			\n\t"/* 16 bytes for array-of-complex */\
		"shlq		$4,%%rbx			\n\t"/* 16 bytes for array-of-complex */\
		"addq		%[__add1],%%rax		\n\t"/* rn0[k1] */\
		"addq		%[__add2],%%rbx		\n\t"/* rn1[k2] */\
		"movaps		(%%rax),%%xmm0		\n\t"/* [c0,s0] */\
		"movaps		(%%rbx),%%xmm1		\n\t"/* [x0,y0] */\
		"movq		%%rsi,%%rax			\n\t"\
		"movaps		%%xmm1,%%xmm2		\n\t"/* [x0,y0] copy */\
		"shufpd	$1,	%%xmm2,%%xmm2		\n\t"/* [y0,x0] (swap re <--> im) */\
		"mulpd		%%xmm0,%%xmm1		\n\t"/* [c0.x0,s0.y0] */\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [c0.y0,s0.x0] 1,2 used */\
		/* Get next root for interleaving with the first: */\
		"addq		$2,%%rax			\n\t"\
		"shrq		$1,%%rax			\n\t"/* l = ((j + idx_offset) >> 1) */\
		"movq		%%rax,%%rbx			\n\t"\
		"andq		%%rdi,%%rax			\n\t"/* k1 = (l & __NRTM1) */\
		"shrq		%%cl,%%rbx			\n\t"/* k2=(l >> __NRT_BITS) */\
		"shlq		$4,%%rax			\n\t"/* 16 bytes for array-of-complex */\
		"shlq		$4,%%rbx			\n\t"/* 16 bytes for array-of-complex */\
		"addq		%[__add1],%%rax		\n\t"/* rn0[k1] */\
		"addq		%[__add2],%%rbx		\n\t"/* rn1[k2] */\
		"movaps		(%%rax),%%xmm0		\n\t"/* [c1,s1] */\
		"movaps		(%%rbx),%%xmm3		\n\t"/* [x1,y1] 0-3 used*/\
		"movq		%%rsi,%%rax			\n\t"\
		"movaps		%%xmm3,%%xmm4		\n\t"/* [x1,y1] copy */\
		"shufpd	$1,	%%xmm4,%%xmm4		\n\t"/* [y1,x1] (swap re <--> im) */\
		"mulpd		%%xmm0,%%xmm3		\n\t"/* [c1.x1,s1.y1] */\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [c1.y1,s1.x1] 1-4 used */\
		"movaps		%%xmm1,%%xmm0		\n\t"/* xmm0 <- copy [c0.x0,s0.y0] */\
		"unpcklpd	%%xmm3,%%xmm0		\n\t"/* [c0.x0,c1.x1] */\
		"unpckhpd	%%xmm3,%%xmm1		\n\t"/* [s0.y0,s1.y1], 0-2,4 used */\
		"subpd		%%xmm1,%%xmm0		\n\t"/* xmm0 = [wt_r0,wt_r1] 0,2,4 used */\
		"movaps		%%xmm2,%%xmm1		\n\t"/* xmm1 <- copy [c0.y0,s0.x0] 0-2,4 used */\
		"unpcklpd	%%xmm4,%%xmm1		\n\t"/* [c0.y0,c1.y1] */\
		"unpckhpd	%%xmm4,%%xmm2		\n\t"/* [s0.x0,s1.x1] */\
		"addpd		%%xmm2,%%xmm1		\n\t"/* xmm1 = [wt_i0,wt_i1] 0-1 used */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"movq		%[__half_arr],%%rcx	\n\t"/* No longer need __NRT_BITS, so reuse rcx */\
		/* Multiply the complex transform output [x,y] = [re,im] by any scale factor: [x,y] *= scale: */\
		"movq		%[__data],%%rdx		\n\t"\
		"movaps		     (%%rdx),%%xmm4	\n\t"/* x = [a.re,b.re] */\
		"movaps		 0x10(%%rdx),%%xmm2	\n\t"/* y = [a.im,b.im] */\
		"movaps		0x020(%%rcx),%%xmm5	\n\t"/* [scale,scale] */\
		"mulpd		%%xmm5,%%xmm4		\n\t"\
		"mulpd		%%xmm5,%%xmm2		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t"/* x copy */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* y copy */\
		/* Inverse weight is (wt_re, -wt_im): */\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [x     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm3		\n\t"/* [y copy]*wt_im */\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [y     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm5		\n\t"/* [x copy]*wt_im */\
		"addpd		%%xmm3,%%xmm4		\n\t"/* [a.re,b.re] = x*wt_re + y*wt_im */\
		"subpd		%%xmm5,%%xmm2		\n\t"/* [a.im,b.im] = y*wt_re - x*wt_im */\
		"movq		%[__cy],%%rbx		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t"/* [a.re,b.re] copy */\
		"shufpd	$0,	%%xmm2,%%xmm4		\n\t"/* xmm4 = x = [a.re,a.im] */\
		"shufpd	$3,	%%xmm2,%%xmm5		\n\t"/* xmm5 = y = [b.re,b.im] 0,1,4,5 used */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"movaps		-0x20(%%rcx),%%xmm6	\n\t"/* xmm6 = maxerr */\
		"movaps		-0x10(%%rcx),%%xmm7	\n\t"/* xmm7 = rnd_const */\
		"movaps		%%xmm4,%%xmm2		\n\t"/* copy x */\
		"addpd		%%xmm7,%%xmm4		\n\t"\
		"subpd		%%xmm7,%%xmm4		\n\t"/* temp = DNINT(x) */\
		"movq		%[__sign_mask],%%rax\n\t"\
		"subpd		%%xmm4,%%xmm2		\n\t"/* frac = [x - temp] */\
		"andpd		     (%%rax),%%xmm2	\n\t"/* frac = fabs(frac) */\
		"mulpd		%%xmm15,%%xmm4		\n\t"\
		"addpd		(%%rbx),%%xmm4		\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t"/* Note serialization here! */\
		"movaps		%%xmm4,%%xmm2		\n\t"/* cpy temp */\
		"mulpd		0x10(%%rcx),%%xmm2	\n\t"/* temp*baseinv[0] */\
		"addpd		%%xmm7,%%xmm2		\n\t"\
		"subpd		%%xmm7,%%xmm2		\n\t"/* [cx,cy] = DNINT(temp*baseinv[0]) */\
		"movaps		%%xmm2,(%%rbx)		\n\t"/* save carry to mem */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* cpy [cx,cy] */\
		"mulpd		    (%%rcx),%%xmm3	\n\t"/* [cx,cy]*base[0] */\
		"subpd		%%xmm3,%%xmm4		\n\t"/* xmm4 = [a.re,a.im] = temp-[cx,cy]*base[0] */\
		/* Now do b-pair: [b.re,b.im] in xmm5, carry in xmm2, xmm3 free, wt_[re,im] in xmmA,B, xmm6 free, rnd_const in xmm7: */\
		"movaps		%%xmm5,%%xmm2		\n\t"/* copy y */\
		"addpd		%%xmm7,%%xmm5		\n\t"\
		"subpd		%%xmm7,%%xmm5		\n\t"/* temp = DNINT(y) */\
		"subpd		%%xmm5,%%xmm2		\n\t"/* frac = [y - temp] */\
		"andpd		     (%%rax),%%xmm2	\n\t"/* frac = fabs(frac) */\
		"mulpd		%%xmm15,%%xmm5		\n\t"\
		"addpd		(%%rbx),%%xmm5		\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t"/* Note serialization here! */\
		"movaps		%%xmm5,%%xmm2		\n\t"/* cpy temp */\
		"mulpd		 0x10(%%rcx),%%xmm2	\n\t"/* temp*baseinv[0] */\
		"addpd		%%xmm7,%%xmm2		\n\t"\
		"subpd		%%xmm7,%%xmm2		\n\t"/* [cx,cy] = DNINT(temp*baseinv[0]) */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* cpy [cx,cy] */\
		"mulpd		     (%%rcx),%%xmm3	\n\t"/* [cx,cy]*base[0] */\
		"subpd		%%xmm3,%%xmm5		\n\t"/* xmm5 = [b.re,b.im] = temp-[cx,cy]*base[0] */\
		"movaps		%%xmm2,(%%rbx)		\n\t"/* store cy_out */\
		"movaps		%%xmm4,%%xmm2		\n\t"/* [a.re,a.im] copy */\
		"shufpd	$0,	%%xmm5,%%xmm4		\n\t"/* x = [a.re,b.re] */\
		"shufpd	$3,	%%xmm5,%%xmm2		\n\t"/* y = [a.im,b.im] */\
		"movaps		%%xmm4,%%xmm5		\n\t"/* x copy */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* y copy */\
		"movaps		%%xmm6,-0x20(%%rcx)	\n\t"/* Store maxerr */\
		/* Forward weight is (wt_re, +wt_im): */\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [x     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm3		\n\t"/* [y copy]*wt_im */\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [y     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm5		\n\t"/* [x copy]*wt_im */\
		"subpd		%%xmm3,%%xmm4		\n\t"/* rt = x*wt_re - y*wt_im */\
		"addpd		%%xmm2,%%xmm5		\n\t"/* it = x*wt_im + y*wt_re */\
		"movaps		%%xmm4,    (%%rdx)	\n\t"/* store rt = ~[a.re,b.re] */\
		"movaps		%%xmm5,0x10(%%rdx)	\n\t"/* store it = ~[a.im,b.im] */\
		/* Prepare for next pair of complex data: */\
		"movslq	%[__idx_incr],%%rdi		\n\t"\
		"addq	%%rdi,%%rsi				\n\t"/* idx_offset += idx_incr */\
		"mov	%%esi, %[__idx_offset]	\n\t"/* Store incremented idx_offset */\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		/* Prefetch address */\
		,	[__add0] "m" (Xadd0)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Same power-of-2-transform deal as above, but use xmm8-15 to process 2 sets of carries side-by-side.
	Data/Carry #2 assumed offset by +0x20/0x10 from #1 (which are accessed via the [__data/__cy] pointers, resp.):
	*/
	#define SSE2_fermat_carry_norm_pow2_errcheck_X2(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xhalf_arr,Xsign_mask,Xadd1,Xadd2, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd	(%%rax),%%xmm15		\n\t	shufpd	$0,%%xmm15,%%xmm15	\n\t"/* prp_mult, broadcast to all double-slots of xmm15 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"prefetcht0	(%%r14)	\n\t"\
		/* lcol -> rcol index analogs: [rsi,rax,rbx] -> [r10,r11,r12], [rcx,rdx,rdi] shared */\
		"movslq	%[__idx_offset],%%rsi	\n\t		movslq	%[__idx_incr],%%r10		\n\t"\
		"movslq		%[__nrt_bits],%%rcx	\n\t		addq	%%rsi,%%r10				\n\t"\
		"movslq		%[__nrtm1],%%rdi	/* r10 contains idx_offset2, i.e. is the rcol-analog of rsi in lcol: */\n\t"\
		"movq		%%rsi,%%rax			\n\t		movq		%%r10,%%r11			\n\t"\
		"shrq		$1,%%rax			\n\t		shrq		$1,%%r11			\n\t"\
		"movq		%%rax,%%rbx			\n\t		movq		%%r11,%%r12			\n\t"\
		"andq		%%rdi,%%rax			\n\t		andq		%%rdi,%%r11			\n\t"\
		"shrq		%%cl,%%rbx			\n\t		shrq		%%cl,%%r12			\n\t"\
		"shlq		$4,%%rax			\n\t		shlq		$4,%%r11			\n\t"\
		"shlq		$4,%%rbx			\n\t		shlq		$4,%%r12			\n\t"\
		"addq		%[__add1],%%rax		\n\t		addq		%[__add1],%%r11		\n\t"\
		"addq		%[__add2],%%rbx		\n\t		addq		%[__add2],%%r12		\n\t"\
		"movaps		(%%rax),%%xmm0		\n\t		movaps		(%%r11),%%xmm8 		\n\t"\
		"movaps		(%%rbx),%%xmm1		\n\t		movaps		(%%r12),%%xmm9 		\n\t"\
		"movq		%%rsi,%%rax			\n\t		movq		%%r10,%%r11			\n\t"\
		"movaps		%%xmm1,%%xmm2		\n\t		movaps		%%xmm9 ,%%xmm10		\n\t"\
		"shufpd	$1,	%%xmm2,%%xmm2		\n\t		shufpd	$1,	%%xmm10,%%xmm10		\n\t"\
		"mulpd		%%xmm0,%%xmm1		\n\t		mulpd		%%xmm8 ,%%xmm9 		\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"addq		$2,%%rax			\n\t		addq		$2,%%r11			\n\t"\
		"shrq		$1,%%rax			\n\t		shrq		$1,%%r11			\n\t"\
		"movq		%%rax,%%rbx			\n\t		movq		%%r11,%%r12			\n\t"\
		"andq		%%rdi,%%rax			\n\t		andq		%%rdi,%%r11			\n\t"\
		"shrq		%%cl,%%rbx			\n\t		shrq		%%cl,%%r12			\n\t"\
		"shlq		$4,%%rax			\n\t		shlq		$4,%%r11			\n\t"\
		"shlq		$4,%%rbx			\n\t		shlq		$4,%%r12			\n\t"\
		"addq		%[__add1],%%rax		\n\t		addq		%[__add1],%%r11		\n\t"\
		"addq		%[__add2],%%rbx		\n\t		addq		%[__add2],%%r12		\n\t"\
		"movaps		(%%rax),%%xmm0		\n\t		movaps		(%%r11),%%xmm8 		\n\t"\
		"movaps		(%%rbx),%%xmm3		\n\t		movaps		(%%r12),%%xmm11		\n\t"\
		"movq		%%rsi,%%rax			\n\t		movq		%%r10,%%r11			\n\t"\
		"movaps		%%xmm3,%%xmm4		\n\t		movaps		%%xmm11,%%xmm12		\n\t"\
		"shufpd	$1,	%%xmm4,%%xmm4		\n\t		shufpd	$1,	%%xmm12,%%xmm12		\n\t"\
		"mulpd		%%xmm0,%%xmm3		\n\t		mulpd		%%xmm8 ,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"movaps		%%xmm1,%%xmm0		\n\t		movaps		%%xmm9 ,%%xmm8 		\n\t"\
		"unpcklpd	%%xmm3,%%xmm0		\n\t		unpcklpd	%%xmm11,%%xmm8 		\n\t"\
		"unpckhpd	%%xmm3,%%xmm1		\n\t		unpckhpd	%%xmm11,%%xmm9 		\n\t"\
		"subpd		%%xmm1,%%xmm0		\n\t		subpd		%%xmm9 ,%%xmm8 		\n\t"\
		"movaps		%%xmm2,%%xmm1		\n\t		movaps		%%xmm10,%%xmm9 		\n\t"\
		"unpcklpd	%%xmm4,%%xmm1		\n\t		unpcklpd	%%xmm12,%%xmm9 		\n\t"\
		"unpckhpd	%%xmm4,%%xmm2		\n\t		unpckhpd	%%xmm12,%%xmm10		\n\t"\
		"addpd		%%xmm2,%%xmm1		\n\t		addpd		%%xmm10,%%xmm9 		\n\t"\
		"movq		%[__half_arr],%%rcx	/* rcx shared, has same offset lcol/rcol: */\n\t"\
		"movaps		-0x10(%%rcx),%%xmm7	\n\t"/* sse2_rnd */\
		"movq		%[__data],%%rdx		/* rdx shared, offset +0x20 in rcol: */		\n\t"\
		"movaps		    (%%rdx),%%xmm4	\n\t		movaps		0x20(%%rdx),%%xmm12	\n\t"\
		"movaps		0x10(%%rdx),%%xmm2	\n\t		movaps		0x30(%%rdx),%%xmm10	\n\t"\
		"movaps		0x20(%%rcx),%%xmm5	\n\t		movaps		0x20(%%rcx),%%xmm13	\n\t"\
		"mulpd		%%xmm5,%%xmm4		\n\t		mulpd		%%xmm13,%%xmm12		\n\t"\
		"mulpd		%%xmm5,%%xmm2		\n\t		mulpd		%%xmm13,%%xmm10		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"mulpd		%%xmm1,%%xmm3		\n\t		mulpd		%%xmm9 ,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"mulpd		%%xmm1,%%xmm5		\n\t		mulpd		%%xmm9 ,%%xmm13		\n\t"\
		"addpd		%%xmm3,%%xmm4		\n\t		addpd		%%xmm11,%%xmm12		\n\t"\
		"subpd		%%xmm5,%%xmm2		\n\t		subpd		%%xmm13,%%xmm10		\n\t"\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq		%[__cy],%%rbx	/* rbx -> rbx+0x10 (carry offset only half of data-offset) in rcol, shared from here */	\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"\
		"shufpd	$0,	%%xmm2,%%xmm4		\n\t		shufpd	$0,	%%xmm10,%%xmm12		\n\t"\
		"shufpd	$3,	%%xmm2,%%xmm5		\n\t		shufpd	$3,	%%xmm10,%%xmm13		\n\t"\
		"movaps		-0x20(%%rcx),%%xmm6	\n\t	movaps	-0x20(%%rcx),%%xmm14	/* Use 2 copies of maxerr, merge at end */\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"\
		"addpd		%%xmm7	,%%xmm4		\n\t		addpd		%%xmm7	,%%xmm12	\n\t"\
		"subpd		%%xmm7	,%%xmm4		\n\t		subpd		%%xmm7	,%%xmm12	\n\t"\
	/*	roundpd	$0,%%xmm4,%%xmm4		\n\t		roundpd	$0,%%xmm12,%%xmm12		\n\t*/\
		"movq		%[__sign_mask],%%rax/* rax shared between lcol/rcol from here */\n\t"\
		"subpd		%%xmm4,%%xmm2		\n\t		subpd		%%xmm12,%%xmm10		\n\t"\
		"andpd		     (%%rax),%%xmm2	\n\t		andpd		     (%%rax),%%xmm10\n\t"\
		"mulpd		%%xmm15,%%xmm4		\n\t		mulpd		%%xmm15,%%xmm12		\n\t"\
		"addpd		(%%rbx),%%xmm4		\n\t		addpd	0x10(%%rbx),%%xmm12		\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t		maxpd		%%xmm14,%%xmm10		\n\t"\
		"movaps		%%xmm2,%%xmm6		\n\t		movaps		%%xmm10,%%xmm14		\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"\
		"mulpd		0x10(%%rcx),%%xmm2	\n\t		mulpd		0x10(%%rcx),%%xmm10	\n\t"\
		"addpd		%%xmm7	,%%xmm2		\n\t		addpd		%%xmm7	,%%xmm10	\n\t"\
		"subpd		%%xmm7	,%%xmm2		\n\t		subpd		%%xmm7	,%%xmm10	\n\t"\
	/*	roundpd	$0,%%xmm2,%%xmm2		\n\t		roundpd	$0,%%xmm10,%%xmm10		\n\t*/\
		"movaps		%%xmm2,    (%%rbx)	\n\t		movaps		%%xmm10,0x10(%%rbx)	\n\t"/* save carry to mem */\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"\
		"mulpd		    (%%rcx),%%xmm3	\n\t		mulpd		    (%%rcx),%%xmm11	\n\t"\
		"subpd		%%xmm3,%%xmm4		\n\t		subpd		%%xmm11,%%xmm12		\n\t"\
		"movaps		%%xmm5,%%xmm2		\n\t		movaps		%%xmm13,%%xmm10		\n\t"\
		"addpd		%%xmm7	,%%xmm5		\n\t		addpd		%%xmm7	,%%xmm13	\n\t"\
		"subpd		%%xmm7	,%%xmm5		\n\t		subpd		%%xmm7	,%%xmm13	\n\t"\
	/*	roundpd	$0,%%xmm5,%%xmm5		\n\t		roundpd	$0,%%xmm13,%%xmm13		\n\t*/\
		"subpd		%%xmm5,%%xmm2		\n\t		subpd		%%xmm13,%%xmm10		\n\t"\
		"andpd		     (%%rax),%%xmm2	\n\t		andpd		     (%%rax),%%xmm10\n\t"\
		"mulpd		%%xmm15,%%xmm5		\n\t		mulpd		%%xmm15,%%xmm13		\n\t"\
		"addpd		    (%%rbx),%%xmm5	\n\t		addpd		0x10(%%rbx),%%xmm13	\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t		maxpd		%%xmm14,%%xmm10		\n\t"\
		"movaps		%%xmm2,%%xmm6		\n\t		movaps		%%xmm10,%%xmm14		\n\t"\
		"movaps		%%xmm5,%%xmm2		\n\t		movaps		%%xmm13,%%xmm10		\n\t"\
		"mulpd		 0x10(%%rcx),%%xmm2	\n\t		mulpd		 0x10(%%rcx),%%xmm10\n\t"\
		"addpd		%%xmm7	,%%xmm2		\n\t		addpd		%%xmm7	,%%xmm10	\n\t"\
		"subpd		%%xmm7	,%%xmm2		\n\t		subpd		%%xmm7	,%%xmm10	\n\t"\
	/*	roundpd	$0,%%xmm2,%%xmm2		\n\t		roundpd	$0,%%xmm10,%%xmm10		\n\t*/\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"\
		"mulpd		     (%%rcx),%%xmm3	\n\t		mulpd		     (%%rcx),%%xmm11\n\t"\
		"subpd		%%xmm3,%%xmm5		\n\t		subpd		%%xmm11,%%xmm13		\n\t"\
		"movaps		%%xmm2,(%%rbx)		\n\t		movaps		%%xmm10,0x10(%%rbx)	\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"\
		"shufpd	$0,	%%xmm5,%%xmm4		\n\t		shufpd	$0,	%%xmm13,%%xmm12		\n\t"\
		"shufpd	$3,	%%xmm5,%%xmm2		\n\t		shufpd	$3,	%%xmm13,%%xmm10		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"mulpd		%%xmm1,%%xmm3		\n\t		mulpd		%%xmm9 ,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"mulpd		%%xmm1,%%xmm5		\n\t		mulpd		%%xmm9 ,%%xmm13		\n\t"\
		"subpd		%%xmm3,%%xmm4		\n\t		subpd		%%xmm11,%%xmm12		\n\t"\
		"addpd		%%xmm2,%%xmm5		\n\t		addpd		%%xmm10,%%xmm13		\n\t"\
		"movaps		%%xmm4,    (%%rdx)	\n\t		movaps		%%xmm12,0x20(%%rdx)	\n\t"\
		"movaps		%%xmm5,0x10(%%rdx)	\n\t		movaps		%%xmm13,0x30(%%rdx)	\n\t"\
		"/* Store larger of maxerr1,2: */	\n\t	movslq	%[__idx_incr],%%rdi		\n\t"\
		"maxpd		%%xmm14,%%xmm6		\n\t		addq	%%r10,%%rdi				\n\t"\
		"movaps		%%xmm6,-0x20(%%rcx)	\n\t		mov	%%edi, %[__idx_offset]	/* Store twice-incremented idx_offset */	\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1] "m" (Xp1)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r10","r11","r12","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Non-power-of-2-runlength Fermat-mod acyclic-transform/IBDWT carry macro.
	The array indices icycle0/1 are declared int in the caller but are assumed to have been left-shifted by 4 (<< 4) by the time this macro is called, so they can be used directly as complex-array byte-address offsets.
	*/
	#define SSE2_fermat_carry_norm_errcheck(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xodd_radix,Xhalf_arr,Xsign_mask,Xadd1,Xadd2,Xicycle0,Xjcycle0, Xadd0, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd	(%%rax),%%xmm15		\n\t	shufpd	$0,%%xmm15,%%xmm15	\n\t"/* prp_mult, broadcast to all double-slots of xmm15; xmm15 persists until applied at the a,b-pair carry-add steps below */\
	"movq	%[__add0],%%r14	\n\t"\
	"prefetcht0	(%%r14)	\n\t"\
		"movslq	%[__idx_offset],%%rsi	\n\t"/* esi stores [j + idx_offset], idx_offset starts = 0, gets incremented by idx_incr each macro invocation */\
		"movslq %[__odd_radix],%%rdi	\n\t"/* [1,2,3]*odd_radix are the index offsets to the wtinv, base, and base_inv values, respectively. */\
		"movslq	%[__nrt_bits],%%rcx		\n\t"\
		"movslq %[__icycle0],%%r10		\n\t"\
		"movslq	%[__jcycle0],%%r11		\n\t"\
		"movq		%%rsi,%%rax			\n\t"/* j + idx_offset */\
		"shrq		$1,%%rax			\n\t"/* l = ((j + idx_offset) >> 1) */\
		"movq		%%rax,%%rbx			\n\t"\
		"andq		%[__nrtm1],%%rax	\n\t"/* k1 = (l & __NRTM1) */\
		"shrq		%%cl,%%rbx			\n\t"/* k2=(l >> __NRT_BITS) */\
		"shlq		$4,%%rax			\n\t"/* 16 bytes for array-of-complex */\
		"shlq		$4,%%rbx			\n\t"/* 16 bytes for array-of-complex */\
		"shlq		$4,%%rdi			\n\t"/* 16 bytes for array-of-complex */\
		"addq		%[__add1],%%rax		\n\t"/* rn0[k1] */\
		"addq		%[__add2],%%rbx		\n\t"/* rn1[k2] */\
		"movaps		(%%rax),%%xmm0		\n\t"/* [c0,s0] */\
		"movaps		(%%rbx),%%xmm1		\n\t"/* [x0,y0] */\
		"movq		%%rsi,%%rax			\n\t"\
		"movaps		%%xmm1,%%xmm2		\n\t"/* [x0,y0] copy */\
		"shufpd	$1,	%%xmm2,%%xmm2		\n\t"/* [y0,x0] (swap re <--> im) */\
		"mulpd		%%xmm0,%%xmm1		\n\t"/* [c0.x0,s0.y0] */\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [c0.y0,s0.x0] 1,2 used */\
		/* Get next root for interleaving with the first: */\
		"addq		$2,%%rax			\n\t"\
		"shrq		$1,%%rax			\n\t"/* l = ((j + idx_offset) >> 1) */\
		"movq		%%rax,%%rbx			\n\t"\
		"andq		%[__nrtm1],%%rax	\n\t"/* k1 = (l & __NRTM1) */\
		"shrq		%%cl,%%rbx			\n\t"/* k2=(l >> __NRT_BITS) */\
		"shlq		$4,%%rax			\n\t"/* 16 bytes for array-of-complex */\
		"shlq		$4,%%rbx			\n\t"/* 16 bytes for array-of-complex */\
		"addq		%[__add1],%%rax		\n\t"/* rn0[k1] */\
		"addq		%[__add2],%%rbx		\n\t"/* rn1[k2] */\
		"movaps		(%%rax),%%xmm0		\n\t"/* [c1,s1] */\
		"movaps		(%%rbx),%%xmm3		\n\t"/* [x1,y1] 0-3 used*/\
		"movq		%%rsi,%%rax			\n\t"\
		"movaps		%%xmm3,%%xmm4		\n\t"/* [x1,y1] copy */\
		"shufpd	$1,	%%xmm4,%%xmm4		\n\t"/* [y1,x1] (swap re <--> im) */\
		"mulpd		%%xmm0,%%xmm3		\n\t"/* [c1.x1,s1.y1] */\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [c1.y1,s1.x1] 1-4 used */\
		"movaps		%%xmm1,%%xmm0		\n\t"/* xmm0 <- copy [c0.x0,s0.y0] */\
		"unpcklpd	%%xmm3,%%xmm0		\n\t"/* [c0.x0,c1.x1] */\
		"unpckhpd	%%xmm3,%%xmm1		\n\t"/* [s0.y0,s1.y1], 0-2,4 used */\
		"subpd		%%xmm1,%%xmm0		\n\t"/* xmm0 = [wt_r0,wt_r1] 0,2,4 used */\
		"movaps		%%xmm2,%%xmm1		\n\t"/* xmm1 <- copy [c0.y0,s0.x0] 0-2,4 used */\
		"unpcklpd	%%xmm4,%%xmm1		\n\t"/* [c0.y0,c1.y1] */\
		"unpckhpd	%%xmm4,%%xmm2		\n\t"/* [s0.x0,s1.x1] */\
		"addpd		%%xmm2,%%xmm1		\n\t"/* xmm1 = [wt_i0,wt_i1] 0-1 used */\
		/* half_arr[0,1,2,3] = [base*2, baseinv*2,wt_re*2,wt_im*2] */\
		"movq		%[__half_arr],%%rcx	\n\t"/* No longer need __NRT_BITS, so reuse ecx */\
		/* Multiply the complex transform output [x,y] = [re,im] by the inverse IBDWT weight, which includes the scale factor: [x,y] *= wtinv: */\
		"movq		%[__data],%%rdx		\n\t"\
		"movaps		     (%%rdx),%%xmm4	\n\t"/* x = [a.re,b.re] */\
		"movaps		 0x10(%%rdx),%%xmm2	\n\t"/* y = [a.im,b.im] */\
		"addq		%%r10,%%rcx			\n\t"\
		"movaps	(%%rcx,%%rdi),%%xmm5	\n\t"/* [wtinv0,wtinv1] */\
		"subq		%%r10,%%rcx			\n\t"\
		"mulpd		%%xmm5,%%xmm4		\n\t"\
		"mulpd		%%xmm5,%%xmm2		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t"/* x copy */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* y copy */\
		/* Inverse weight is (wt_re, -wt_im): */\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [x     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm3		\n\t"/* [y copy]*wt_im */\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [y     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm5		\n\t"/* [x copy]*wt_im */\
		"addpd		%%xmm3,%%xmm4		\n\t"/* [a.re,b.re] = x*wt_re + y*wt_im */\
		"subpd		%%xmm5,%%xmm2		\n\t"/* [a.im,b.im] = y*wt_re - x*wt_im */\
		"movq		%[__cy],%%rbx		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t"/* [a.re,b.re] copy */\
		"shufpd	$0,	%%xmm2,%%xmm4		\n\t"/* xmm4 = x = [a.re,a.im] */\
		"shufpd	$3,	%%xmm2,%%xmm5		\n\t"/* xmm5 = y = [b.re,b.im] 0,1,4,5 used */\
		/* normalize a-pair, compute carryout, compute ROE: */\
		"mulpd		%%xmm15,%%xmm4		\n\t"/* x *= prp_mult (v18: residue-shifted Pepin-test support; cf. same placement in the _X2 macro below) */\
		"addpd		     (%%rbx),%%xmm4	\n\t"/* temp = x*prp_mult + [cx,cy] */\
		"movaps		-0x20(%%rcx),%%xmm6	\n\t"/* xmm6 = maxerr */\
		"movaps		-0x10(%%rcx),%%xmm7	\n\t"/* xmm7 = rnd_const */\
		"addq	   %%r10,%%rcx			\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t"/* copy x */\
		"shlq	   $1,%%rdi				\n\t"\
		"addpd		%%xmm7,%%xmm4		\n\t"\
		"subpd		%%xmm7,%%xmm4		\n\t"/* temp = DNINT(x) */\
		"movq		%[__sign_mask],%%rax\n\t"\
		"subpd		%%xmm4,%%xmm2		\n\t"/* frac = [x - temp] */\
		"andpd		(%%rax),%%xmm2		\n\t"/* frac = fabs(frac) */\
		"maxpd		%%xmm6,%%xmm2		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t"/* Note serialization here! */\
		"addq		%%rdi,%%rcx			\n\t"\
		"shrq		$1,%%rdi			\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t"/* cpy temp */\
		"mulpd  (%%rcx,%%rdi),%%xmm2	\n\t"/* temp*baseinv[0] */\
		"addpd		%%xmm7,%%xmm2		\n\t"\
		"subpd		%%xmm7,%%xmm2		\n\t"/* [cx,cy] = DNINT(temp*baseinv[0]) */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* cpy [cx,cy] */\
		"mulpd		(%%rcx),%%xmm3		\n\t"/* [cx,cy]*base[0] */\
		"subq		%%r10,%%rcx			\n\t"\
		"subpd		%%xmm3,%%xmm4		\n\t"/* xmm4 = [a.re,a.im] = temp-[cx,cy]*base[0] */\
		/* Now do b-pair: [b.re,b.im] in xmm5, carry in xmm2, xmm3 free, wt_[re,im] in xmmA,B, xmm6 free, rnd_const in xmm7: */\
		"mulpd		%%xmm15,%%xmm5		\n\t"/* y *= prp_mult (v18: residue-shifted Pepin-test support) */\
		"addpd		%%xmm2,%%xmm5		\n\t"/* y = y*prp_mult + [cx,cy] */\
		"movaps		%%xmm5,%%xmm2		\n\t"/* copy y */\
		"addpd		%%xmm7,%%xmm5		\n\t"\
		"subpd		%%xmm7,%%xmm5		\n\t"/* temp = DNINT(y) */\
		"subpd		%%xmm5,%%xmm2		\n\t"/* frac = [y - temp] */\
		"andpd		(%%rax),%%xmm2		\n\t"/* frac = fabs(frac) */\
		"maxpd		%%xmm6,%%xmm2		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t"/* Note serialization here! */\
		"movaps		%%xmm5,%%xmm2		\n\t"/* cpy temp */\
		"addq		%%r11,%%rcx			\n\t"\
		"mulpd  (%%rcx,%%rdi),%%xmm2	\n\t"/* temp*baseinv[1] */\
		"addpd		%%xmm7,%%xmm2		\n\t"\
		"subpd		%%xmm7,%%xmm2		\n\t"/* [cx,cy] = DNINT(temp*baseinv[1]) */\
		"shlq		$1,%%rdi			\n\t"/* prepare to re-subtract 2*odd_radix from local-store pointer */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* cpy [cx,cy] */\
		"mulpd		(%%rcx),%%xmm3		\n\t"/* [cx,cy]*base[1] */\
		"subq		%%r11,%%rcx			\n\t"\
		"subpd		%%xmm3,%%xmm5		\n\t"/* xmm5 = [b.re,b.im] = temp-[cx,cy]*base[1] */\
		"movaps		%%xmm2,(%%rbx)		\n\t"/* store cy_out */\
		"movaps		%%xmm4,%%xmm2		\n\t"/* [a.re,a.im] copy */\
		"shufpd	$0,	%%xmm5,%%xmm4		\n\t"/* x = [a.re,b.re] */\
		"shufpd	$3,	%%xmm5,%%xmm2		\n\t"/* y = [a.im,b.im] */\
		"movaps		%%xmm4,%%xmm5		\n\t"/* x copy */\
		"movaps		%%xmm2,%%xmm3		\n\t"/* y copy */\
		/* Forward acyclic-convo weight is (wt_re, +wt_im): */\
		"subq		%%rdi,%%rcx			\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t"/* [x     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm3		\n\t"/* [y copy]*wt_im */\
		"movaps		%%xmm6,-0x20(%%rcx)	\n\t"/* Store maxerr */\
		"addq		%%r10,%%rcx			\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t"/* [y     ]*wt_re */\
		"mulpd		%%xmm1,%%xmm5		\n\t"/* [x copy]*wt_im */\
		"movaps		(%%rcx),%%xmm0		\n\t"/* [wt0,wt1] */\
		"subpd		%%xmm3,%%xmm4		\n\t"/* rt = x*wt_re - y*wt_im */\
		"addpd		%%xmm2,%%xmm5		\n\t"/* it = x*wt_im + y*wt_re */\
		/* Forward IBDWT weight: */\
		"mulpd		%%xmm0,%%xmm4		\n\t"\
		"mulpd		%%xmm0,%%xmm5		\n\t"\
		"movaps		%%xmm4,    (%%rdx)	\n\t"/* store rt = ~[a.re,b.re] */\
		"movaps		%%xmm5,0x10(%%rdx)	\n\t"/* store it = ~[a.im,b.im] */\
		/* Prepare for next pair of complex data: */\
		"addq	%[__idx_incr],%%rsi		\n\t"/* idx_offset += idx_incr */\
		"mov	%%esi, %[__idx_offset]	\n\t"/* Store incremented idx_offset. NOTE(review): writes through an "m" *input* operand, relying on the "memory" clobber; a "+m" output would be cleaner -- confirm before changing. */\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__odd_radix]   "m" (Xodd_radix)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		,	[__icycle0]		"m" (Xicycle0)\
		,	[__jcycle0]		"m" (Xjcycle0)\
		/* Prefetch address */\
		,	[__add0] "m" (Xadd0)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r10","r11","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Same non-power-of-2-transform deal as above, but use xmm8-15 to process 2 sets of carries side-by-side.
	Data/Carry #2 assumed offset by +0x20/0x10 from #1 (which are accessed via the [__data/__cy] pointers, resp.)
	i/jcycle0 and i/jcycle1 are the address offsets needed for IBDWT array indexing for the 2 resp. carries. */
	#define SSE2_fermat_carry_norm_errcheck_X2(Xdata,Xcy,Xnrt_bits,Xnrtm1,Xidx_offset,Xidx_incr,Xodd_radix,Xhalf_arr,Xsign_mask,Xadd1,Xadd2,Xicycle0,Xjcycle0,Xicycle1,Xjcycle1, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult],%%rax	\n\t"\
		"movsd	(%%rax),%%xmm15		\n\t	shufpd	$0,%%xmm15,%%xmm15	\n\t"/* prp_mult, broadcast to all double-slots of xmm15 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"prefetcht0	(%%r14)	\n\t"\
		/* lcol -> rcol index analogs: [rsi,rax,rbx] -> [r10,r11,r12], [rcx,rdx,rdi] shared */\
		"movslq	%[__idx_offset],%%rax	\n\t		movslq	%[__idx_incr],%%r10		\n\t"\
		"movslq		%[__nrt_bits],%%rcx	\n\t		addq	%%rax,%%r10				\n\t"\
		"movslq		%[__nrtm1],%%rdi	\n\t		movq		%%r10,%%r11	\n\t"/* r10 contains idx_offset2, i.e. is the rcol-analog of idx_offset1 in lcol: */\
		"shrq		$1,%%rax			\n\t		shrq		$1,%%r11			\n\t"\
		"movq		%%rax,%%rbx			\n\t		movq		%%r11,%%r12			\n\t"\
		"andq		%%rdi,%%rax			\n\t		andq		%%rdi,%%r11			\n\t"\
		"shrq		%%cl,%%rbx			\n\t		shrq		%%cl,%%r12			\n\t"\
		"shlq		$4,%%rax			\n\t		shlq		$4,%%r11			\n\t"\
		"shlq		$4,%%rbx			\n\t		shlq		$4,%%r12			\n\t"\
		"addq		%[__add1],%%rax		\n\t		addq		%[__add1],%%r11		\n\t"\
		"addq		%[__add2],%%rbx		\n\t		addq		%[__add2],%%r12		\n\t"\
		"movaps		(%%rax),%%xmm0		\n\t		movaps		(%%r11),%%xmm8 		\n\t"\
		"movaps		(%%rbx),%%xmm1		\n\t		movaps		(%%r12),%%xmm9 		\n\t"\
		"movslq	%[__idx_offset],%%rax	\n\t		movq		%%r10,%%r11			\n\t"\
		"movaps		%%xmm1,%%xmm2		\n\t		movaps		%%xmm9 ,%%xmm10		\n\t"\
		"shufpd	$1,	%%xmm2,%%xmm2		\n\t		shufpd	$1,	%%xmm10,%%xmm10		\n\t"\
		"mulpd		%%xmm0,%%xmm1		\n\t		mulpd		%%xmm8 ,%%xmm9 		\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"addq		$2,%%rax			\n\t		addq		$2,%%r11			\n\t"\
		"shrq		$1,%%rax			\n\t		shrq		$1,%%r11			\n\t"\
		"movq		%%rax,%%rbx			\n\t		movq		%%r11,%%r12			\n\t"\
		"andq		%%rdi,%%rax			\n\t		andq		%%rdi,%%r11			\n\t"\
		"shrq		%%cl,%%rbx			\n\t		shrq		%%cl,%%r12			\n\t"\
		"shlq		$4,%%rax			\n\t		shlq		$4,%%r11			\n\t"\
		"shlq		$4,%%rbx			\n\t		shlq		$4,%%r12			\n\t"\
		"addq		%[__add1],%%rax		\n\t		addq		%[__add1],%%r11		\n\t"\
		"addq		%[__add2],%%rbx		\n\t		addq		%[__add2],%%r12		\n\t"\
		"movaps		(%%rax),%%xmm0		\n\t		movaps		(%%r11),%%xmm8 		\n\t"\
		"movaps		(%%rbx),%%xmm3		\n\t		movaps		(%%r12),%%xmm11		\n\t"\
		"movslq	%[__idx_offset],%%rax	\n\t		movq		%%r10,%%r11			\n\t"\
		"movaps		%%xmm3,%%xmm4		\n\t		movaps		%%xmm11,%%xmm12		\n\t"\
		"shufpd	$1,	%%xmm4,%%xmm4		\n\t		shufpd	$1,	%%xmm12,%%xmm12		\n\t"\
		"mulpd		%%xmm0,%%xmm3		\n\t		mulpd		%%xmm8 ,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"movaps		%%xmm1,%%xmm0		\n\t		movaps		%%xmm9 ,%%xmm8 		\n\t"\
		"unpcklpd	%%xmm3,%%xmm0		\n\t		unpcklpd	%%xmm11,%%xmm8 		\n\t"\
		"unpckhpd	%%xmm3,%%xmm1		\n\t		unpckhpd	%%xmm11,%%xmm9 		\n\t"\
		"subpd		%%xmm1,%%xmm0		\n\t		subpd		%%xmm9 ,%%xmm8 		\n\t"\
		"movaps		%%xmm2,%%xmm1		\n\t		movaps		%%xmm10,%%xmm9 		\n\t"\
		"unpcklpd	%%xmm4,%%xmm1		\n\t		unpcklpd	%%xmm12,%%xmm9 		\n\t"\
		"unpckhpd	%%xmm4,%%xmm2		\n\t		unpckhpd	%%xmm12,%%xmm10		\n\t"\
		"addpd		%%xmm2,%%xmm1		\n\t		addpd		%%xmm10,%%xmm9 		\n\t"\
		"/* Store twice-incremented idx_offset to free up registers rdi,r10,r11: */	\n\t"\
		"											movslq	%[__idx_incr],%%rdi		\n\t"\
		"											addq	%%r10,%%rdi				\n\t"\
		"											mov	%%edi, %[__idx_offset]		\n\t"\
		"movslq	%[__icycle0],%%r8 		\n\t		movslq	%[__icycle1],%%r10		\n\t"\
		"movslq	%[__odd_radix],%%rdi	\n\t"\
		"movq		%[__half_arr],%%rcx	/* Need separate rcol copy of rcx below */	\n\t"\
		"movaps		-0x10(%%rcx),%%xmm7	\n\t"/* sse2_rnd */\
		"movq		%[__data],%%rdx		/* rdx shared, offset +0x20 in rcol: */		\n\t"\
		"shlq		$4,%%rdi			\n\t		movq		%%rcx,%%r12	/* rcol-copy for incr/decr: */\n\t"\
		"movaps		    (%%rdx),%%xmm4	\n\t		movaps		0x20(%%rdx),%%xmm12	\n\t"\
		"movaps		0x10(%%rdx),%%xmm2	\n\t		movaps		0x30(%%rdx),%%xmm10	\n\t"\
		"addq		%%r8 ,%%rcx			\n\t		addq		%%r10,%%r12			\n\t"\
		"movaps	(%%rcx,%%rdi),%%xmm5	\n\t		movaps	(%%r12,%%rdi),%%xmm13	\n\t"\
		"subq		%%r8 ,%%rcx			\n\t		subq		%%r10,%%r12	/* rcx == r12 again */\n\t"\
		"mulpd		%%xmm5,%%xmm4		\n\t		mulpd		%%xmm13,%%xmm12		\n\t"\
		"mulpd		%%xmm5,%%xmm2		\n\t		mulpd		%%xmm13,%%xmm10		\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"mulpd		%%xmm1,%%xmm3		\n\t		mulpd		%%xmm9 ,%%xmm11		\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"mulpd		%%xmm1,%%xmm5		\n\t		mulpd		%%xmm9 ,%%xmm13		\n\t"\
		"addpd		%%xmm3,%%xmm4		\n\t		addpd		%%xmm11,%%xmm12		\n\t"\
		"subpd		%%xmm5,%%xmm2		\n\t		subpd		%%xmm13,%%xmm10		\n\t"\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq		%[__cy],%%rbx	/* rbx -> rbx+0x10 (carry offset only half of data-offset) in rcol, shared from here */	\n\t"\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"\
		"shufpd	$0,	%%xmm2,%%xmm4		\n\t		shufpd	$0,	%%xmm10,%%xmm12		\n\t"\
		"shufpd	$3,	%%xmm2,%%xmm5		\n\t		shufpd	$3,	%%xmm10,%%xmm13		\n\t"\
		"movaps		-0x20(%%rcx),%%xmm6	/* maxerr, will make rcol-copy below, re-merge at end */\n\t"\
		"addq		%%r8 ,%%rcx			\n\t		addq		%%r10,%%r12			\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"\
		"shlq		$1,%%rdi			\n\t		movaps		%%xmm6,%%xmm14	/* rcol-copy of maxerr */\n\t"\
		"addpd		%%xmm7	,%%xmm4		\n\t		addpd		%%xmm7	,%%xmm12	\n\t"\
		"subpd		%%xmm7	,%%xmm4		\n\t		subpd		%%xmm7	,%%xmm12	\n\t"/* temp = DNINT(x) */\
	/*	roundpd	$0,%%xmm4,%%xmm4		\n\t		roundpd	$0,%%xmm12,%%xmm12		\n\t*/\
		"movq		%[__sign_mask],%%rax/* rax shared between lcol/rcol from here */\n\t"\
		"subpd		%%xmm4,%%xmm2		\n\t		subpd		%%xmm12,%%xmm10		\n\t"/* frac = [x - temp] */\
		"andpd		(%%rax),%%xmm2		\n\t		andpd		(%%rax),%%xmm10		\n\t"/* frac = fabs(frac) */\
		"mulpd		%%xmm15,%%xmm4		\n\t		mulpd		%%xmm15,%%xmm12		\n\t"\
		"addpd		(%%rbx),%%xmm4		\n\t		addpd	0x10(%%rbx),%%xmm12		\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t		maxpd		%%xmm14,%%xmm10		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t		movaps		%%xmm10,%%xmm14		\n\t"\
		"addq		%%rdi,%%rcx			\n\t		addq		%%rdi,%%r12			\n\t"\
		"shrq		$1,%%rdi														\n\t"\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"/* cpy temp */\
		"mulpd	(%%rcx,%%rdi),%%xmm2	\n\t		mulpd	(%%r12,%%rdi),%%xmm10	\n\t"/* temp*baseinv[0] */\
		"addpd		%%xmm7	,%%xmm2		\n\t		addpd		%%xmm7	,%%xmm10	\n\t"\
		"subpd		%%xmm7	,%%xmm2		\n\t		subpd		%%xmm7	,%%xmm10	\n\t"/* cy = DNINT(temp*baseinv[0]) */\
	/*	roundpd	$0,%%xmm2,%%xmm2		\n\t		roundpd	$0,%%xmm10,%%xmm10		\n\t*/\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"/* cpy cy */\
		"mulpd		    (%%rcx),%%xmm3	\n\t		mulpd		    (%%r12),%%xmm11	\n\t"/* cy*base[0] */\
		"subq		%%r8 ,%%rcx			\n\t		subq		%%r10,%%r12			\n\t"\
		"subpd		%%xmm3,%%xmm4		\n\t		subpd		%%xmm11,%%xmm12		\n\t"/* xmm4 = [a.re,a.im] = temp-[cx,cy]*base[0] */\
		/* Now do b-pair: [b.re,b.im] in xmm5, carry in xmm2, xmm3 free, wt_[re,im] in xmmA,B, xmm6 free, rnd_const in xmm7: */\
		"movaps		%%xmm2,(%%rbx)		\n\t		movaps		%%xmm10,0x10(%%rbx)	\n\t"/* With prp_mult support, must write cy since xmm2,10 get overwritten next */\
		"movaps		%%xmm5,%%xmm2		\n\t		movaps		%%xmm13,%%xmm10		\n\t"/* cpy y */\
		"addpd		%%xmm7	,%%xmm5		\n\t		addpd		%%xmm7	,%%xmm13	\n\t"\
		"subpd		%%xmm7	,%%xmm5		\n\t		subpd		%%xmm7	,%%xmm13	\n\t"/* temp = DNINT(y) */\
	/*	roundpd	$0,%%xmm5,%%xmm5		\n\t		roundpd	$0,%%xmm13,%%xmm13		\n\t*/\
		"subpd		%%xmm5,%%xmm2		\n\t		subpd		%%xmm13,%%xmm10		\n\t"/* frac = [y - temp] */\
		"andpd		(%%rax),%%xmm2		\n\t		andpd		(%%rax),%%xmm10		\n\t"/* frac = fabs(frac) */\
		"mulpd		%%xmm15,%%xmm5		\n\t		mulpd		%%xmm15,%%xmm13		\n\t"\
		"addpd		(%%rbx),%%xmm5		\n\t		addpd	0x10(%%rbx),%%xmm13		\n\t"/* temp = temp*prp_mult + cy */\
		"maxpd		%%xmm6,%%xmm2		\n\t		maxpd		%%xmm14,%%xmm10		\n\t"/* if(frac > maxerr) maxerr=frac */\
		"movaps		%%xmm2,%%xmm6		\n\t		movaps		%%xmm10,%%xmm14		\n\t"\
		"movaps		%%xmm5,%%xmm2		\n\t		movaps		%%xmm13,%%xmm10		\n\t"/* cpy temp */\
		"movslq	%[__jcycle0],%%r8 		\n\t		movslq	%[__jcycle1],%%r10		\n\t"\
		"addq		%%r8 ,%%rcx			\n\t		addq		%%r10,%%r12			\n\t"\
		"maxpd		%%xmm14,%%xmm6		\n\t"/* Save larger of maxerr1,2: */\
		"mulpd	(%%rcx,%%rdi),%%xmm2	\n\t		mulpd	(%%r12,%%rdi),%%xmm10	\n\t"/* temp*baseinv[0] */\
		"addpd		%%xmm7	,%%xmm2		\n\t		addpd		%%xmm7	,%%xmm10	\n\t"\
		"subpd		%%xmm7	,%%xmm2		\n\t		subpd		%%xmm7	,%%xmm10	\n\t"/* cy = DNINT(temp*baseinv[0]) */\
	/*	roundpd	$0,%%xmm2,%%xmm2		\n\t		roundpd	$0,%%xmm10,%%xmm10		\n\t*/\
		"shlq		$1,%%rdi														\n\t"\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"/* cpy cy */\
		"mulpd		(%%rcx),%%xmm3		\n\t		mulpd		(%%r12),%%xmm11		\n\t"/* cy*base[0] */\
		"subq		%%r8 ,%%rcx			\n\t		subq		%%r10,%%r12			\n\t"\
		"subpd		%%xmm3,%%xmm5		\n\t		subpd		%%xmm11,%%xmm13		\n\t"/* xmm5 = [b.re,b.im] = temp-[cx,cy]*base[0] */\
		"movaps		%%xmm2,(%%rbx)		\n\t		movaps		%%xmm10,0x10(%%rbx)	\n\t"/* store cy_out */\
		"movaps		%%xmm4,%%xmm2		\n\t		movaps		%%xmm12,%%xmm10		\n\t"/* [a.re,a.im] copy */\
		"shufpd	$0,	%%xmm5,%%xmm4		\n\t		shufpd	$0,	%%xmm13,%%xmm12		\n\t"/* x = [a.re,b.re] */\
		"shufpd	$3,	%%xmm5,%%xmm2		\n\t		shufpd	$3,	%%xmm13,%%xmm10		\n\t"/* y = [a.im,b.im] */\
		"movaps		%%xmm4,%%xmm5		\n\t		movaps		%%xmm12,%%xmm13		\n\t"/* x copy */\
		"movaps		%%xmm2,%%xmm3		\n\t		movaps		%%xmm10,%%xmm11		\n\t"/* y copy */\
		"subq		%%rdi,%%rcx			\n\t		subq		%%rdi,%%r12	/* rcx == r12 again */\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"mulpd		%%xmm1,%%xmm3		\n\t		mulpd		%%xmm9 ,%%xmm11		\n\t"\
		"movaps		%%xmm6,-0x20(%%rcx)	\n\t"/* Store maxerr: */\
		"movslq	%[__icycle0],%%r8 		\n\t		movslq	%[__icycle1],%%r10		\n\t"\
		"addq		%%r8 ,%%rcx			\n\t		addq		%%r10,%%r12			\n\t"\
		"mulpd		%%xmm0,%%xmm2		\n\t		mulpd		%%xmm8 ,%%xmm10		\n\t"\
		"mulpd		%%xmm1,%%xmm5		\n\t		mulpd		%%xmm9 ,%%xmm13		\n\t"\
		"movaps		(%%rcx),%%xmm0		\n\t		movaps		(%%r12),%%xmm8 		\n\t"\
		"subpd		%%xmm3,%%xmm4		\n\t		subpd		%%xmm11,%%xmm12		\n\t"\
		"addpd		%%xmm2,%%xmm5		\n\t		addpd		%%xmm10,%%xmm13		\n\t"\
		"mulpd		%%xmm0,%%xmm4		\n\t		mulpd		%%xmm8 ,%%xmm12		\n\t"\
		"mulpd		%%xmm0,%%xmm5		\n\t		mulpd		%%xmm8 ,%%xmm13		\n\t"\
		"movaps		%%xmm4,    (%%rdx)	\n\t		movaps		%%xmm12,0x20(%%rdx)	\n\t"\
		"movaps		%%xmm5,0x10(%%rdx)	\n\t		movaps		%%xmm13,0x30(%%rdx)	\n\t"\
		:						/* outputs: none */\
		:	[__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		,	[__cy]			"m" (Xcy)\
		,	[__nrt_bits]	"m" (Xnrt_bits)\
		,	[__nrtm1]		"m" (Xnrtm1)\
		,	[__idx_offset]	"m" (Xidx_offset)\
		,	[__idx_incr]	"m" (Xidx_incr)\
		,	[__odd_radix]	"m" (Xodd_radix)\
		,	[__half_arr]	"m" (Xhalf_arr)\
		,	[__sign_mask]	"m" (Xsign_mask)\
		,	[__add1]		"m" (Xadd1)\
		,	[__add2]		"m" (Xadd2)\
		,	[__icycle0]		"m" (Xicycle0)\
		,	[__jcycle0]		"m" (Xjcycle0)\
		,	[__icycle1]		"m" (Xicycle1)\
		,	[__jcycle1]		"m" (Xjcycle1)\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		/* v18: Needed to support residue-shifted Pepin tests: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","r8","r10","r11","r12","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

#endif	// AVX or SSE2?

	/*************************************************************/
	/**************** MERSENNE-MOD CARRY MACROS ******************/
	/*************************************************************/

// These LOACC wtsinit macros are common to avx/avx2 builds ... avx-512 supports only the 8-way (and wider) variants:
#ifdef USE_AVX512

	/***********************************************************************************/
	/**** For AVX512-and-beyond we support only the fast [LOACC] Mers-carry macros. ****/
	/***********************************************************************************/

	// 16-fold analog of AVX_cmplx_carry_fast_pow2_wtsinit_X8:
	#define AVX_cmplx_carry_fast_pow2_wtsinit_X16(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
		/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
		Since we gets .5,.25 via bitfield-load-as-double, those use VPBROADCASTQ; rest use VBROADCASTSD-from-mem-address:
		[1] Fwd-wt multipliers: Init = 0.50 x 8, anytime AVX-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 8, anytime AVX-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"movq $0x3FE0000000000000,%%rsi	\n\t	vpbroadcastq  %%rsi ,%%zmm30\n\t"\
		"movq $0x3FD0000000000000,%%rdi	\n\t	vpbroadcastq  %%rdi ,%%zmm31\n\t"\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 8 doubles d0-7 in zmm */\
		"movq $0x0001020304050607,%%rsi	\n\t"/* 64-bit register w/byte offsets 7-0, bytes ordered left-to-right in decreasing significance */\
		"vmovq		%%rsi,%%xmm3 		\n\t"/* Copy byte pattern to low qword (64 bits) of zmm3 [NB: AVX-512 only supports MOVQ to/from 128-bit vector regs] */\
		"vpmovzxbq	%%xmm3,%%zmm3		\n\t"/* vector-permutation-index: zmm3 = [7,6,5,4,3,2,1,0] in qwords.
																zmm3 PERSISTENT FROM HERE TIL END OF MACRO */\
		"movq	%[__half_arr],%%rdi		\n\t"\
	/**********************************/\
	/* Do A.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"/* Init zmm8,9,24,25 in prep for conditional-doubling */\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%zmm0		\n\t"/* bjmod[0:15]. PERSISTENT COPY OF BJMOD[0:15] REMAINS IN zmm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%zmm1	\n\t"/* Broadcast n_minus_sil to all 16 slots of zmm1 */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"/* n_minus_sil[zmm1] >= bjmod[0:15][zmm0] ? Opmask K1 is bit-flipped-analog of AVX-mode bitmask stored in RCX */\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%zmm1	\n\t"/* Broadcast sinwt to all 16 slots of zmm1, then effect sinwt < bjmod[0:15][zmm0] via */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"/* bjmod[0:15][zmm0] >= sinwt ?             Opmask K2 is bit-flipped-analog of AVX-mode bitmask stored in RDX */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%zmm4	\n\t	vmovaps	 0x40(%%rax),%%zmm20	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x30(%%rbx),%%zmm5	\n\t	vmovaps	-0x70(%%rbx),%%zmm21	\n\t"/* wtB[j-1]; load doubles from rcx+[-0x30,-0x28,-0x20,-0x18,-0x10,-0x08, 0, +0x08] - It may not look like it but this is in fact an aligned load */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t	vpermq	%%zmm21,%%zmm3,%%zmm21	\n\t"/* d[0-7],[8-15] -> d[7-0],[15-8] */\
		/* AVX-512 LOACC wtsinit_X16 put wtl/wtn in [half_arr + 64-67]: */\
	/* In AVX-512 version, [wtl|wtn|wtlp1|wtnm1]-quartet addresses incr by 0x80 between ABCD-blocks, then reset to 0x1020 a start of E-block: */\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"/* Upper halves of above-computed 16-bit opmasks, used for rcol operands */\
		/* Put these results into rcol-registers since ensuing MULs overwrite those last: */\
		"vbroadcastsd 0x1000(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"/* one_half[m0-15] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"/* one_half[n0-15] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"/* wt    *= one_half[m0-15] */\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"/* wtinv *= one_half[...+n0-15] */\
		/* Results go into scratch storage = [half_arr + 0-63] in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x000(%%rdi)	\n\t	vmovaps %%zmm17,0x040(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x080(%%rdi)	\n\t	vmovaps %%zmm18,0x0c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw] ,%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vmovaps	(%%rax),%%zmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,NM1 REMAIN IN zmm6,7. */\
		"vmovaps	(%%rbx),%%zmm7		\n\t"\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"/* bjmod[0:15] += bw  */\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"/* bjmod[0:15] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%zmm1	\n\t"\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%zmm1	\n\t"\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 15 of 16 sets of carries */\
		"vmovaps	-0x30(%%rsi),%%zmm5	\n\t	vmovaps	-0x70(%%rsi),%%zmm21	\n\t"/* wtC[j-1]; as with wtB, this is an aligned-address 'in disguise' */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t	vpermq	%%zmm21,%%zmm3,%%zmm21	\n\t"/* d[0-7],[8-15] -> d[7-0],[15-8] */\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1010(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"/* one_half[m0-15] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"/* one_half[n0-15] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"/* wtinv=wtC*wtn */\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"/* wt    *= one_half[m0-15] */\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"/* wtinv *= one_half[...+n0-15] */\
		"vmovaps 	%%zmm1,0x100(%%rdi)	\n\t	vmovaps %%zmm17,0x140(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x180(%%rdi)	\n\t	vmovaps %%zmm18,0x1c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x200(%%rdi)	\n\t	vmovaps %%zmm17,0x240(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x280(%%rdi)	\n\t	vmovaps %%zmm18,0x2c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x300(%%rdi)	\n\t	vmovaps %%zmm17,0x340(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x380(%%rdi)	\n\t	vmovaps %%zmm18,0x3c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1080(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1088(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x400(%%rdi)	\n\t	vmovaps %%zmm17,0x440(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x480(%%rdi)	\n\t	vmovaps %%zmm18,0x4c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1090(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1098(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x500(%%rdi)	\n\t	vmovaps %%zmm17,0x540(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x580(%%rdi)	\n\t	vmovaps %%zmm18,0x5c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10c0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10c8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x600(%%rdi)	\n\t	vmovaps %%zmm17,0x640(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x680(%%rdi)	\n\t	vmovaps %%zmm18,0x6c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10d0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10d8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x700(%%rdi)	\n\t	vmovaps %%zmm17,0x740(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x780(%%rdi)	\n\t	vmovaps %%zmm18,0x7c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x800(%%rdi)	\n\t	vmovaps %%zmm17,0x840(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x880(%%rdi)	\n\t	vmovaps %%zmm18,0x8c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x900(%%rdi)	\n\t	vmovaps %%zmm17,0x940(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x980(%%rdi)	\n\t	vmovaps %%zmm18,0x9c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xa00(%%rdi)	\n\t	vmovaps %%zmm17,0xa40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xa80(%%rdi)	\n\t	vmovaps %%zmm18,0xac0(%%rdi)	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xb00(%%rdi)	\n\t	vmovaps %%zmm17,0xb40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xb80(%%rdi)	\n\t	vmovaps %%zmm18,0xbc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10a0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10a8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xc00(%%rdi)	\n\t	vmovaps %%zmm17,0xc40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xc80(%%rdi)	\n\t	vmovaps %%zmm18,0xcc0(%%rdi)	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10b0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10b8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xd00(%%rdi)	\n\t	vmovaps %%zmm17,0xd40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xd80(%%rdi)	\n\t	vmovaps %%zmm18,0xdc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10e0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10e8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xe00(%%rdi)	\n\t	vmovaps %%zmm17,0xe40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xe80(%%rdi)	\n\t	vmovaps %%zmm18,0xec0(%%rdi)	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpandd		%%zmm7,%%zmm0,%%zmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10f0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10f8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xf00(%%rdi)	\n\t	vmovaps %%zmm17,0xf40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xf80(%%rdi)	\n\t	vmovaps %%zmm18,0xfc0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:15] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm17","xmm18","xmm20","xmm21","xmm24","xmm25","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	// As with the _X4|8 wtsinit macros in AVX build mode, this AVX-512 8-way init only populates half the slots
	// in the chunk of local-mem used to hold the outputs, to ensure mem-layout compatibility with its 16-way
	// counterpart above, which uses all the mem-slots:
	#define AVX_cmplx_carry_fast_pow2_wtsinit_X8(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
		/* For the AVX-512 sans-table-lookup impl, here are the needed consts and opmasks.
		Since we get .5,.25 via bitfield-load-as-double, those use VPBROADCASTQ; rest use VBROADCASTSD-from-mem-address:
		[1] Fwd-wt multipliers: Init = 0.50 x 8, anytime AVX-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 8, anytime AVX-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"movq $0x3FE0000000000000,%%rsi	\n\t	vpbroadcastq  %%rsi ,%%zmm30\n\t"\
		"movq $0x3FD0000000000000,%%rdi	\n\t	vpbroadcastq  %%rdi ,%%zmm31\n\t"\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 8 doubles d0-7 in zmm */\
		"movq $0x0001020304050607,%%rsi	\n\t"/* 64-bit register w/byte offsets 7-0, bytes ordered left-to-right in decreasing significance */\
		"vmovq		%%rsi,%%xmm3 		\n\t"/* Copy byte pattern to low qword (64 bits) of ymm3 [NB: AVX-512 only supports MOVQ to/from 128-bit vector regs] */\
		"vpmovzxbq	%%xmm3,%%zmm3		\n\t"/* vector-permutation-index: zmm3 = [7,6,5,4,3,2,1,0] in qwords.
																zmm3 PERSISTENT FROM HERE TIL END OF MACRO */\
		"movq	%[__half_arr],%%rdi		\n\t"\
	/**********************************/\
	/* Do A.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"/* Init zmm8,9 in prep for conditional-doubling */\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%ymm0		\n\t"/* bjmod[0:7]. PERSISTENT COPY OF BJMOD[0:7] REMAINS IN ymm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%ymm1	\n\t"/* Broadcast n_minus_sil to all 8 slots of ymm1 */\
	/*** Compares must be at full width even though we only use the lower half, due to AVX512F compliance requirement ***/\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"/* n_minus_sil[ymm1] >= bjmod[0:7][ymm0] ? Opmask K1 is bit-flipped-analog of AVX-mode bitmask stored in RCX */\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%ymm1	\n\t"/* Broadcast sinwt to all 8 slots of ymm1 */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"/* bjmod[0:7][ymm0] >= sinwt ?              Opmask K2 is bit-flipped-analog of AVX-mode bitmask stored in RDX */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%zmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x30(%%rbx),%%zmm5	\n\t"/* wtB[j-1]; load doubles from rbx+[-0x30,-0x28,-0x20,-0x18,-0x10,-0x08, 0, +0x08] - It may not look like it but this is in fact an aligned load */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t"/* d[0-7] -> d[7-0] */\
		/* AVX-512 LOACC wtsinit put wtl/wtn in [half_arr + 64-67]: */\
	/* In AVX-512 version, [wtl|wtn|wtlp1|wtnm1]-quartet addresses incr by 0x40 between ABCD-blocks, then reset to 0x1020 a start of E-block: */\
		"vbroadcastsd 0x1000(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"/* one_half[m0-7] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"/* one_half[n0-7] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"/* wt    *= one_half[m0-7] */\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"/* wtinv *= one_half[...+n0-7] */\
		/* Results go into scratch storage = [half_arr + 0-63] - only half of said slots used by this 8-way routine - in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x000(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x080(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw] ,%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vmovaps	(%%rax),%%ymm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,NM1 REMAIN IN ymm6,7. */\
		"vmovaps	(%%rbx),%%ymm7		\n\t"\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%ymm1	\n\t"\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%ymm1	\n\t"\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x30(%%rsi),%%zmm5	\n\t"/* wtC[j-1]; as with wtB, this is an aligned-address 'in disguise' */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t"/* d[0-7] -> d[7-0] */\
		"vbroadcastsd 0x1010(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"/* one_half[m0-7] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"/* one_half[n0-7] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"/* wtinv=wtC*wtn */\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"/* wt    *= one_half[m0-7] */\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"/* wtinv *= one_half[...+n0-7] */\
		/* Results go into scratch storage = [half_arr + 0-31] in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x100(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x180(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x200(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x280(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x300(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x380(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1080(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1088(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x400(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x480(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1090(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1098(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x500(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x580(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10c0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10c8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x600(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x680(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10d0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10d8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x700(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x780(%%rdi)	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x800(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x880(%%rdi)	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x900(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x980(%%rdi)	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xa00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xa80(%%rdi)	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xb00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xb80(%%rdi)	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10a0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10a8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xc00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xc80(%%rdi)	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10b0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10b8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xd00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xd80(%%rdi)	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10e0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10e8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xe00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xe80(%%rdi)	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpand		%%ymm7,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10f0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10f8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xf00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xf80(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:7] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	// 16-fold analog of AVX_cmplx_carry_fast_wtsinit_X8:
	#define AVX_cmplx_carry_fast_wtsinit_X16(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
		/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
		Since we get .5,.25 via bitfield-load-as-double, those use VPBROADCASTQ; rest use VBROADCASTSD-from-mem-address:
		[1] Fwd-wt multipliers: Init = 0.50 x 8, anytime AVX-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 8, anytime AVX-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"movq $0x3FE0000000000000,%%rsi	\n\t	vpbroadcastq  %%rsi ,%%zmm30\n\t"\
		"movq $0x3FD0000000000000,%%rdi	\n\t	vpbroadcastq  %%rdi ,%%zmm31\n\t"\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 8 doubles d0-7 in zmm */\
		"movq $0x0001020304050607,%%rsi	\n\t"/* 64-bit register w/byte offsets 7-0, bytes ordered left-to-right in decreasing significance */\
		"vmovq		%%rsi,%%xmm3 		\n\t"/* Copy byte pattern to low qword (64 bits) of zmm3 [NB: AVX-512 only supports MOVQ to/from 128-bit vector regs] */\
		"vpmovzxbq	%%xmm3,%%zmm3		\n\t"/* vector-permutation-index: zmm3 = [7,6,5,4,3,2,1,0] in qwords.
																zmm3 PERSISTENT FROM HERE TIL END OF MACRO */\
		"movq	%[__half_arr],%%rdi		\n\t"\
	/**********************************/\
	/* Do A.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"/* Init zmm8,9,24,25 in prep for conditional-doubling */\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%zmm0		\n\t"/* bjmod[0:15]. PERSISTENT COPY OF BJMOD[0:15] REMAINS IN zmm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%zmm1	\n\t"/* Broadcast n_minus_sil to all 16 slots of zmm1 */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"/* n_minus_sil[zmm1] >= bjmod[0:15][zmm0] ? Opmask K1 is bit-flipped-analog of AVX-mode bitmask stored in RCX */\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%zmm1	\n\t"/* Broadcast sinwt to all 16 slots of zmm1, then effect sinwt < bjmod[0:15][zmm0] via */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"/* bjmod[0:15][zmm0] >= sinwt ?             Opmask K2 is bit-flipped-analog of AVX-mode bitmask stored in RDX */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%zmm4	\n\t	vmovaps	 0x40(%%rax),%%zmm20	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x30(%%rbx),%%zmm5	\n\t	vmovaps	-0x70(%%rbx),%%zmm21	\n\t"/* wtB[j-1]; load doubles from rbx+[-0x30,-0x28,-0x20,-0x18,-0x10,-0x08, 0, +0x08] - It may not look like it but this is in fact an aligned load */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t	vpermq	%%zmm21,%%zmm3,%%zmm21	\n\t"/* d[0-7],[8-15] -> d[7-0],[15-8] */\
		/* AVX-512 LOACC wtsinit_X16 put wtl/wtn in [half_arr + 64-67]: */\
	/* In AVX-512 version, [wtl|wtn|wtlp1|wtnm1]-quartet addresses incr by 0x80 between ABCD-blocks, then reset to 0x1020 at start of E-block: */\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"/* Upper halves of above-computed 16-bit opmasks, used for rcol operands */\
		/* Put these results into rcol-registers since ensuing MULs overwrite those last: */\
		"vbroadcastsd 0x1000(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"/* one_half[m0-15] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"/* one_half[n0-15] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"/* wt    *= one_half[m0-15] */\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"/* wtinv *= one_half[...+n0-15] */\
		/* Results go into scratch storage = [half_arr + 0-63] in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x000(%%rdi)	\n\t	vmovaps %%zmm17,0x040(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x080(%%rdi)	\n\t	vmovaps %%zmm18,0x0c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw] ,%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_n],%%rbx		\n\t"\
		"vmovaps	(%%rax),%%zmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,N REMAIN IN zmm6,7. */\
		"vmovaps	(%%rbx),%%zmm7		\n\t"\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"/* bjmod[0:15] += bw */\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"/* if(bjmod[0:15] > n) corr. bit in k1 set [AT&T operand order: mask = zmm0 > zmm7] */\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"/* if(bjmod[0:15] > n) bjmod[0:15] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%zmm1	\n\t"\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%zmm1	\n\t"\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 15 of 16 sets of carries */\
		"vmovaps	-0x30(%%rsi),%%zmm5	\n\t	vmovaps	-0x70(%%rsi),%%zmm21	\n\t"/* wtC[j-1]; as with wtB, this is an aligned-address 'in disguise' */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t	vpermq	%%zmm21,%%zmm3,%%zmm21	\n\t"/* d[0-7],[8-15] -> d[7-0],[15-8] */\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1010(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"/* one_half[m0-15] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"/* one_half[n0-15] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"/* wtinv=wtC*wtn */\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"/* wt    *= one_half[m0-15] */\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"/* wtinv *= one_half[...+n0-15] */\
		"vmovaps 	%%zmm1,0x100(%%rdi)	\n\t	vmovaps %%zmm17,0x140(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x180(%%rdi)	\n\t	vmovaps %%zmm18,0x1c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x200(%%rdi)	\n\t	vmovaps %%zmm17,0x240(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x280(%%rdi)	\n\t	vmovaps %%zmm18,0x2c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%zmm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x300(%%rdi)	\n\t	vmovaps %%zmm17,0x340(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x380(%%rdi)	\n\t	vmovaps %%zmm18,0x3c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1080(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1088(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x400(%%rdi)	\n\t	vmovaps %%zmm17,0x440(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x480(%%rdi)	\n\t	vmovaps %%zmm18,0x4c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%zmm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1090(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1098(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x500(%%rdi)	\n\t	vmovaps %%zmm17,0x540(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x580(%%rdi)	\n\t	vmovaps %%zmm18,0x5c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10c0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10c8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x600(%%rdi)	\n\t	vmovaps %%zmm17,0x640(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x680(%%rdi)	\n\t	vmovaps %%zmm18,0x6c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%zmm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10d0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10d8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x700(%%rdi)	\n\t	vmovaps %%zmm17,0x740(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x780(%%rdi)	\n\t	vmovaps %%zmm18,0x7c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x800(%%rdi)	\n\t	vmovaps %%zmm17,0x840(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x880(%%rdi)	\n\t	vmovaps %%zmm18,0x8c0(%%rdi)	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%zmm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0x900(%%rdi)	\n\t	vmovaps %%zmm17,0x940(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x980(%%rdi)	\n\t	vmovaps %%zmm18,0x9c0(%%rdi)	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xa00(%%rdi)	\n\t	vmovaps %%zmm17,0xa40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xa80(%%rdi)	\n\t	vmovaps %%zmm18,0xac0(%%rdi)	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%zmm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xb00(%%rdi)	\n\t	vmovaps %%zmm17,0xb40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xb80(%%rdi)	\n\t	vmovaps %%zmm18,0xbc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10a0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10a8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xc00(%%rdi)	\n\t	vmovaps %%zmm17,0xc40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xc80(%%rdi)	\n\t	vmovaps %%zmm18,0xcc0(%%rdi)	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%zmm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10b0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10b8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xd00(%%rdi)	\n\t	vmovaps %%zmm17,0xd40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xd80(%%rdi)	\n\t	vmovaps %%zmm18,0xdc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10e0(%%rdi),%%zmm17	\n\t"/* wtl */\
		"vbroadcastsd 0x10e8(%%rdi),%%zmm18	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xe00(%%rdi)	\n\t	vmovaps %%zmm17,0xe40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xe80(%%rdi)	\n\t	vmovaps %%zmm18,0xec0(%%rdi)	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
		"vpcmpgtd	%%zmm7,%%zmm0,%%k1		\n\t"\
		"vpsubd	%%zmm7,%%zmm0,%%zmm0%{%%k1%}\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm30,%%zmm24\n\t"\
		"vmovaps	     %%zmm31,%%zmm9	\n\t	vmovaps	     %%zmm31,%%zmm25\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%zmm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3		\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vbroadcastsd 0x10f0(%%rdi),%%zmm17	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10f8(%%rdi),%%zmm18	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t	vaddpd	%%zmm30,%%zmm30,%%zmm24%{%%k3%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t	vaddpd	%%zmm31,%%zmm31,%%zmm25%{%%k4%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm17,%%zmm1	\n\t	vmulpd	%%zmm20,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm5,%%zmm18,%%zmm2	\n\t	vmulpd	%%zmm21,%%zmm18,%%zmm18	\n\t"\
		"vmulpd	%%zmm8,%%zmm1 ,%%zmm1	\n\t	vmulpd	%%zmm24,%%zmm17,%%zmm17	\n\t"\
		"vmulpd	%%zmm9,%%zmm2 ,%%zmm2	\n\t	vmulpd	%%zmm25,%%zmm18,%%zmm18	\n\t"\
		"vmovaps 	%%zmm1,0xf00(%%rdi)	\n\t	vmovaps %%zmm17,0xf40(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xf80(%%rdi)	\n\t	vmovaps %%zmm18,0xfc0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:15] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm17","xmm18","xmm20","xmm21","xmm24","xmm25","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	// Non-power-of-2 analog of AVX_cmplx_carry_fast_pow2_wtsinit_X8 - Differs from above pow2 version only in how we do the modding in the bjmodn += bw (mod n) step:
	#define AVX_cmplx_carry_fast_wtsinit_X8(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
		/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
		Since we gets .5,.25 via bitfield-load-as-double, those use VPBROADCASTQ; rest use VBROADCASTSD-from-mem-address:
		[1] Fwd-wt multipliers: Init = 0.50 x 8, anytime AVX-style lookup into 1st mini-table would have bit = 0, double the corr. datum
		[2] Inv-wt multipliers: Init = 0.25 x 8, anytime AVX-style lookup into 2nd mini-table would have bit = 0, double the corr. datum
		*/\
		"movq $0x3FE0000000000000,%%rsi	\n\t	vpbroadcastq  %%rsi ,%%zmm30\n\t"/* zmm30 = 0.50 x 8 (IEEE-754 bit pattern loaded as double) */\
		"movq $0x3FD0000000000000,%%rdi	\n\t	vpbroadcastq  %%rdi ,%%zmm31\n\t"/* zmm31 = 0.25 x 8 */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 8 doubles d0-7 in zmm */\
		"movq $0x0001020304050607,%%rsi	\n\t"/* 64-bit register w/byte offsets 7-0, bytes ordered left-to-right in decreasing significance */\
		"vmovq		%%rsi,%%xmm3 		\n\t"/* Copy byte pattern to low qword (64 bits) of ymm3 [NB: AVX-512 only supports MOVQ to/from 128-bit vector regs] */\
		"vpmovzxbq	%%xmm3,%%zmm3		\n\t"/* vector-permutation-index: zmm3 = [7,6,5,4,3,2,1,0] in qwords.
																zmm3 PERSISTENT FROM HERE TIL END OF MACRO */\
		"movq	%[__half_arr],%%rdi		\n\t"\
	/**********************************/\
	/* Do A.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"/* Init zmm8,9 in prep for conditional-doubling */\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%ymm0		\n\t"/* bjmod[0:7]. PERSISTENT COPY OF BJMOD[0:7] REMAINS IN ymm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%ymm1	\n\t"/* Broadcast n_minus_sil to all 8 slots of ymm1 */\
	/*** Compares must be at full width even though only use lower half, due to AVX512F compliance requirement" ***/\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"/* n_minus_sil[ymm1] >= bjmod[0:7][ymm0] ? Opmask K1 is bit-flipped-analog of AVX-mode bitmask stored in RCX */\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%ymm1	\n\t"/* Broadcast sinwt to all 8 slots of ymm1 */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"/* bjmod[0:7][ymm0] >= sinwt ?             Opmask K2 is bit-flipped-analog of AVX-mode bitmask stored in RDX */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovups	     (%%rax),%%zmm4	\n\t"/* wtA[j  ]; for RADIX == 0 (mod 8) this is an aligned load, but need this macro to work also for RADIX == 4 (mod 8) cases, so use VMOVUPS here and for wtB,C[j-1] loads below and further down */\
		"vmovups	-0x30(%%rbx),%%zmm5	\n\t"/* wtB[j-1]; load 8 doubles from rbx+[-0x30,-0x28,-0x20,-0x18,-0x10,-0x08, 0, +0x08] */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t"/* d[0-7] -> d[7-0] */\
		/* AVX-512 LOACC wtsinit put wtl/wtn in [half_arr + 64-67]: */\
	/* In AVX-512 version, [wtl|wtn|wtlp1|wtnm1]-quartet addresses incr by 0x40 between ABCD-blocks, then reset to 0x1020 a start of E-block: */\
		"vbroadcastsd 0x1000(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"/* one_half[m0-7] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"/* one_half[n0-7] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"/* wt    *= one_half[m0-7] */\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"/* wtinv *= one_half[...+n0-7] */\
		/* Results go into scratch storage = [half_arr + 0-63] - only half of said slots used by this 8-way routine - in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x000(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x080(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw],%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_n] ,%%rbx		\n\t"\
		"vmovaps	(%%rax),%%ymm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,N REMAIN IN ymm6,7. */\
		"vmovaps	(%%rbx),%%ymm7		\n\t"\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"/* bjmod[0:7] += bw */\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"/* ymm1 = all-ones in each dword lane where bjmod[0:7] > n [NB: mask direction is (bjmod > n); lanes with bjmod == n exactly are left unreduced - presumably unreachable for valid bjmod,bw < n; confirm] */\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"/* ymm1 = n in lanes needing a mod-n reduction, 0 elsewhere */\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"/* bjmod[0:7] -= n wherever bjmod[0:7] > n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x0(%%rcx),%%ymm1	\n\t"\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
	"vpbroadcastd	0x0(%%rdx),%%ymm1	\n\t"\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovups	-0x30(%%rsi),%%zmm5	\n\t"/* wtC[j-1]; load 8 doubles from rsi+[-0x30,-0x28,-0x20,-0x18,-0x10,-0x08, 0, +0x08] */\
		"vpermq	%%zmm5,%%zmm3,%%zmm5	\n\t"/* d[0-7] -> d[7-0] */\
		"vbroadcastsd 0x1010(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"/* one_half[m0-7] multiplier for wt    */\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"/* one_half[n0-7] multiplier for wtinv */\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"/* wtinv=wtC*wtn */\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"/* wt    *= one_half[m0-7] */\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"/* wtinv *= one_half[...+n0-7] */\
		/* Results go into scratch storage = [half_arr + 0-31] in AVX-512 mode: */\
		"vmovaps 	%%zmm1,0x100(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x180(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x200(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x280(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x4(%%rcx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x4(%%rdx),%%ymm1	\n\t"/* .d1 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x300(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x380(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1080(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1088(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x400(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x480(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x8(%%rcx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x8(%%rdx),%%ymm1	\n\t"/* .d2 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1090(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1098(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x500(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x580(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10c0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10c8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x600(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x680(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0xc(%%rcx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0xc(%%rdx),%%ymm1	\n\t"/* .d3 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10d0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10d8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x700(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x780(%%rdi)	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%zmm1	\n\t"/* wtl [NB: wts-quartet address resets to 0x1020 here at E-block, per interleaving described in A-block comment] */\
		"vbroadcastsd 0x1028(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x800(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x880(%%rdi)	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x10(%%rcx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x10(%%rdx),%%ymm1	\n\t"/* .d4 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0x900(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0x980(%%rdi)	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xa00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xa80(%%rdi)	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x14(%%rcx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x14(%%rdx),%%ymm1	\n\t"/* .d5 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xb00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xb80(%%rdi)	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10a0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10a8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xc00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xc80(%%rdi)	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x18(%%rcx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x18(%%rdx),%%ymm1	\n\t"/* .d6 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10b0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10b8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xd00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xd80(%%rdi)	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10e0(%%rdi),%%zmm1	\n\t"/* wtl */\
		"vbroadcastsd 0x10e8(%%rdi),%%zmm2	\n\t"/* wtn */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xe00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xe80(%%rdi)	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
		"vpcmpgtd	%%ymm7,%%ymm0,%%ymm1	\n\t"\
		"vpand		%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vpsubd		%%ymm1,%%ymm0,%%ymm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet:                 */\
	/**********************************/\
		"vmovaps	     %%zmm30,%%zmm8	\n\t	vmovaps	     %%zmm31,%%zmm9	\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
	"vpbroadcastd	0x1c(%%rcx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm0,%%zmm1,%%k1	\n\t"\
		"movq	%[__sinwtm1],%%rdx		\n\t"\
	"vpbroadcastd	0x1c(%%rdx),%%ymm1	\n\t"/* .d7 term of index octet */\
		"vpcmpd	$5,%%zmm1,%%zmm0,%%k2	\n\t"\
		"vbroadcastsd 0x10f0(%%rdi),%%zmm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x10f8(%%rdi),%%zmm2	\n\t"/* wtnm1 */\
	"vaddpd	%%zmm30,%%zmm30,%%zmm8%{%%k1%}	\n\t"\
	"vaddpd	%%zmm31,%%zmm31,%%zmm9%{%%k2%}	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm8,%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm9,%%zmm2,%%zmm2	\n\t"\
		"vmovaps 	%%zmm1,0xf00(%%rdi)	\n\t"\
		"vmovaps 	%%zmm2,0xf80(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:7] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX)	// non-FMA-using versions of the 8-way and 4-way macros def'd in common for AVX and AVX2:

	// AVX macro to do the 4 x 4 cmplx_carry_fast_pow2_wtsinit() scalar-double macro init calls in 4-way parallel mode.
	// This is essentially the weights-computation portion of the AVX_cmplx_carry_norm_pow2_errcheck_X4 macro,
	// with the computed weights and their inverses overwriting the input wtl,n data in local memory; the latter data's
	// addresses are fiddled w.r.to their value in the aforementioned carry-macros in order to match those of the outputs
	// of the scalar-double macro sequence.
	#define AVX_cmplx_carry_fast_pow2_wtsinit_X4(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
		"movq	%[__half_arr],%%rdi		\n\t"/* half_arr + 16*[0,1,2,3,4,5] = [wt,wt_inv,base,baseinv,wts_mult,inv_mult] */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%xmm0		\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"/* n_minus_sil in low 32 bits of xmm1 */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"/* Broadcast low 32 bits of xmm1 to all 4 slots of xmm1 */\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm1,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"/* sinwt in low 32 bits of xmm2*/\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"/* Broadcast low 32 bits of xmm2to all 4 slots of xmm2*/\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"/* xmm3 = bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm3,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x10(%%rbx),%%ymm5	\n\t"/* wtB[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm5*/\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* LOACC wtsinit put wtl/wtn in [half_arr + 128-131] (64 slots higher than HIACC) because [half_arr + 64-127] used for wts_mult, inv_mult tables: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"vmovaps 	%%ymm1,0xc00(%%rdi)	\n\t"\
		"vmovaps 	%%ymm2,0xc40(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vmovaps	(%%rax),%%xmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,NM1 REMAIN IN xmm6,7. */\
		"vmovaps	(%%rbx),%%xmm7		\n\t"\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"/* bjmod[0:3] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"/* bjmod[0:3] &= nm1; & doesn't care whether integer [pand] or floating [andpd], but data are int, so use pand for form's sake */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rsi),%%ymm5	\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"vmovaps	%%ymm1,0xc80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xcc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xd00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xd40(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xd80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xdc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xe00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xe40(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xe80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xec0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xf00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xf40(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xf80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xfc0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"	/* Clobbered registers */\
	);\
	}

	// 8-fold analog of AVX_cmplx_carry_fast_pow2_wtsinit_X4: computes the forward/inverse DWT weights
	// for 8 index quartets per A/B/C/D re/im set, by processing two bjmod quartets at once — the low
	// half lives in xmm0, the high half in xmm8 — using a two-column instruction layout (left column
	// = quartet 0:3, right column = quartet 4:7).
	//
	// Register roles inside the asm body:
	//   rdi        = half_arr base pointer; all weight tables are at fixed offsets from it
	//   xmm0,xmm8  = PERSISTENT copies of bjmod[0:3],[4:7] (packed int32), updated at end of each block
	//   xmm6,xmm7  = PERSISTENT copies of sse_bw, sse_nm1 (power-of-2 modulus => update is add + AND)
	//   ymm4,ymm12 = wtA data; ymm5,ymm13 = byte-reversed wtB (first block) / wtC (remaining blocks)
	//   rcx,rdx and r8,r9 = per-block table byte-offsets derived from movmskps sign-mask extraction
	// Left-column results go to even-index ymm-sized slots at half_arr + 0xc00..0xfc0, right-column
	// results to the adjacent odd-index slots (+0x20).
	// NOTE(review): needs 64-bit mode (r8/r9, xmm8-13) and AVX; bjmod_0 is read as 32 bytes of
	// 16-byte-aligned int32 data — confirm at call sites. bjmod[0:7] are left unchanged in memory
	// (init macro contract; see comment at end of asm body).
	#define AVX_cmplx_carry_fast_pow2_wtsinit_X8(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
		"movq	%[__half_arr],%%rdi		\n\t"/* half_arr + 16*[0,1,2,3,4,5] = [wt,wt_inv,base,baseinv,wts_mult,inv_mult] */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
		"movq	%[__bjmod_0],%%rax		\n\t"/* bjmod[0:3][4:7]. PERSISTENT COPIES REMAIN IN xmm0,8. */\
		"vmovaps	(%%rax),%%xmm0		\n\t	vmovaps	0x10(%%rax),%%xmm8	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"/* n_minus_sil in low 32 bits of xmm1 */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"/* 4-way broadcast of lo32 bits */\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"/* n_minus_sil - bjmod[0:3][4:7] */\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"/* Extract sign bits. */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"/* sinwt in low 32 bits of xmm2*/\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"/* 4-way broadcast of lo32 bits ... this gets subbed *from* bjmodn, so no need for register-copy */\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"/* xmm3 = bjmod[0:3][4:7] - sinwt */\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"/* Extract sign bits. */\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm4	\n\t	vmovaps	 0x20(%%rax),%%ymm12\n\t"/* wtA[j  ]; */\
		"vmovaps	-0x10(%%rbx),%%ymm5	\n\t	vmovaps	-0x30(%%rbx),%%ymm13\n\t"/* wtB[j-1] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm5*/\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		/* LOACC wtsinit put wtl/wtn in [half_arr + 128-131] (64 slots higher than HIACC) because [half_arr + 64-127] used for wts_mult, inv_mult tables: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps 	%%ymm1,0xc00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xc20(%%rdi)	\n\t"\
		"vmovaps 	%%ymm2,0xc40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xc60(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vmovaps	(%%rax),%%xmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,NM1 REMAIN IN xmm6,7. */\
		"vmovaps	(%%rbx),%%xmm7		\n\t"\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"/* bjmod[0:3][4:7] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"/* bjmod[0:3][4:7] &= nm1; & doesn't care whether integer [pand] or floating [andpd], but data are int, so use pand for form's sake */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"/* .d0 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"/* .d0 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rsi),%%ymm5	\n\t	vmovaps	-0x30(%%rsi),%%ymm13\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xc80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xca0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xcc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xce0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xd00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xd20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xd40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xd60(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xd80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xda0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xdc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xde0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xe00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xe20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xe40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xe60(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xe80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xea0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xec0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xee0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xf00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xf20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xf40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xf60(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpand		%%xmm7,%%xmm0,%%xmm0	\n\t	vpand		%%xmm7,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xf80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xfa0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xfc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xfe0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13"	/* Clobbered registers */\
	);\
	}

	// AVX macro to do the 4 x 4 cmplx_carry_fast_wtsinit() scalar-double macro init calls in 4-way parallel mode.
	// This is essentially the weights-computation portion of the AVX_cmplx_carry_norm_errcheck_X4 macro,
	// with the computed weights and their inverses overwriting the input wtl,n data in local memory; the latter data's
	// addresses are fiddled w.r.t. their values in the aforementioned carry-macros in order to match those of the
	// outputs of the scalar-double macro sequence.
	// The only difference between this and its power-of-2 cousin is the mechanics of the bjmodn + bw (mod n) step
	// at the end of each of the 8 data-processing blocks: here n is a general modulus, so the reduction is done via
	// compare/mask/subtract rather than a simple AND with n-1.
	//
	#define AVX_cmplx_carry_fast_wtsinit_X4(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
		"movq	%[__half_arr],%%rdi		\n\t"/* half_arr + 16*[0,1,2,3,4,5] = [wt,wt_inv,base,baseinv,wts_mult,inv_mult] */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
		"movq	%[__bjmod_0],%%rax		\n\t"\
		"vmovaps	(%%rax),%%xmm0		\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"/* n_minus_sil in low 32 bits of xmm1 */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"/* Broadcast low 32 bits of xmm1 to all 4 slots of xmm1 */\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm1,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"/* sinwt in low 32 bits of xmm2*/\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"/* Broadcast low 32 bits of xmm2to all 4 slots of xmm2*/\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"/* xmm3 = bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm3,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x10(%%rbx),%%ymm5	\n\t"/* wtB[j-1] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm5*/\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* LOACC wtsinit put wtl/wtn in [half_arr + 128-131] (64 slots higher than HIACC) because [half_arr + 64-127] used for wts_mult, inv_mult tables: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"vmovaps 	%%ymm1,0xc00(%%rdi)	\n\t"\
		"vmovaps 	%%ymm2,0xc40(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw],%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_n] ,%%rbx		\n\t"\
		"vmovaps	(%%rax),%%xmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,N REMAIN IN xmm6,7. */\
		"vmovaps	(%%rbx),%%xmm7		\n\t"\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"/* bjmod[0:3] += bw */\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"/* if(n > bjmod[0:3]) xmm1 = n; otherwise 0 */\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rsi),%%ymm5	\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"vmovaps	%%ymm1,0xc80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xcc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xd00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xd40(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xd80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xdc0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xe00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xe40(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xe80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xec0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm1	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm2	\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xf00(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xf40(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm1	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm2	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2\n\t"\
		/* Results go into even-index slots: */\
		"vmovaps	%%ymm1,0xf80(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xfc0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"	/* Clobbered registers */\
	);\
	}

	// 8-fold analog of AVX_cmplx_carry_fast_wtsinit_X4:
	#define AVX_cmplx_carry_fast_wtsinit_X8(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
		"movq	%[__half_arr],%%rdi		\n\t"/* half_arr + 16*[0,1,2,3,4,5] = [wt,wt_inv,base,baseinv,wts_mult,inv_mult] */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
		"movq	%[__bjmod_0],%%rax		\n\t"/* bjmod[0:3][4:7]. PERSISTENT COPIES REMAIN IN xmm0,8. */\
		"vmovaps	(%%rax),%%xmm0		\n\t	vmovaps	0x10(%%rax),%%xmm8	\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"/* n_minus_sil in low 32 bits of xmm1 */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"/* 4-way broadcast of lo32 bits */\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"/* n_minus_sil - bjmod[0:3][4:7] */\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"/* Extract sign bits. */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"/* sinwt in low 32 bits of xmm2*/\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"/* 4-way broadcast of lo32 bits ... this gets subbed *from* bjmodn, so no need for regsiter-copy */\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"/* xmm3 = bjmod[0:3][4:7] - sinwt */\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"/* Extract sign bits. */\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm4	\n\t	vmovaps	 0x20(%%rax),%%ymm12\n\t"/* wtA[j  ]; */\
		"vmovaps	-0x10(%%rbx),%%ymm5	\n\t	vmovaps	-0x30(%%rbx),%%ymm13\n\t"/* wtB[j-1] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm5*/\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		/* LOACC wtsinit put wtl/wtn in [half_arr + 128-131] (64 slots higher than HIACC) because [half_arr + 64-127] used for wts_mult, inv_mult tables: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps 	%%ymm1,0xc00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xc20(%%rdi)	\n\t"\
		"vmovaps 	%%ymm2,0xc40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xc60(%%rdi)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw],%%rax		\n\t"/* After initial loads, rax,rbx dedicated to bw,nm1 data */\
		"movq	%[__sse_n] ,%%rbx		\n\t"\
		"vmovaps	(%%rax),%%xmm6		\n\t"/* PERSISTENT COPIES OF SSE_BW,N REMAIN IN xmm6,7. */\
		"vmovaps	(%%rbx),%%xmm7		\n\t"\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"/* bjmod[0:3][4:7] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"/* if(n > bjmod[0:3][4:7]) xmm1 = 11...11 */\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"/* if(n > bjmod[0:3][4:7]) xmm1 = n; otherwise 0 */\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"/* if(n > bjmod[0:3][4:7]) bjmod[0:3][4:7] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x0(%%rcx),%%xmm1	\n\t"\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x0(%%rdx),%%xmm2	\n\t"\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"movq	%[__wtC]	,%%rsi		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rsi),%%ymm5	\n\t	vmovaps	-0x30(%%rsi),%%ymm13\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm5,%%ymm5,%%ymm5	\n\t vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xc80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xca0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xcc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xce0(%%rdi)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xd00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xd20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xd40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xd60(%%rdi)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x4(%%rcx),%%xmm1	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x4(%%rdx),%%xmm2	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xd80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xda0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xdc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xde0(%%rdi)	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xe00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xe20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xe40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xe60(%%rdi)	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0x8(%%rcx),%%xmm1	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0x8(%%rdx),%%xmm2	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xe80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xea0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xec0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xee0(%%rdi)	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
		"movq %[__n_minus_sil],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtn */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xf00(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xf20(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xf40(%%rdi)		\n\t	vmovaps 	%%ymm10,0xf60(%%rdi)	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		%%xmm6,%%xmm0,%%xmm0	\n\t	vpaddd		%%xmm6,%%xmm8,%%xmm8	\n\t"\
		"vpcmpgtd	%%xmm7,%%xmm0,%%xmm1	\n\t	vpcmpgtd	%%xmm7,%%xmm8,%%xmm9	\n\t"\
		"vpand		%%xmm7,%%xmm1,%%xmm1	\n\t	vpand		%%xmm7,%%xmm9,%%xmm9	\n\t"\
		"vpsubd		%%xmm1,%%xmm0,%%xmm0	\n\t	vpsubd		%%xmm9,%%xmm8,%%xmm8	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq %[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd		0xC(%%rcx),%%xmm1	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm1,%%xmm1	\n\t	vmovaps	%%xmm1,%%xmm9		\n\t"\
		"vpsubd	%%xmm0,%%xmm1,%%xmm1	\n\t	vpsubd	%%xmm8,%%xmm9,%%xmm9\n\t"\
		"vmovmskps	%%xmm1,%%rcx		\n\t	vmovmskps	%%xmm9,%%r8 	\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx	\n\t"\
		"vmovd		0xC(%%rdx),%%xmm2	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,%%xmm2,%%xmm2	\n\t"\
		"vpsubd	%%xmm2,%%xmm0,%%xmm3	\n\t vpsubd	%%xmm2,%%xmm8,%%xmm11	\n\t"\
		"vmovmskps	%%xmm3,%%rdx		\n\t	vmovmskps	%%xmm11,%%r9 	\n\t"\
		"\n\t"\
	"shlq	$5,%%rcx	\n\t	shlq	$5,%%r8	\n\t"\
	"shlq	$5,%%rdx	\n\t	shlq	$5,%%r9	\n\t"\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t	addq	%%rdi,%%r8	\n\t"\
		"addq	%%rdi,%%rdx		\n\t	addq	%%rdi,%%r9	\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm1	\n\t	vmovaps	%%ymm1,%%ymm9			\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm2	\n\t	vmovaps	%%ymm2,%%ymm10			\n\t"/* wtnm1 */\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vmulpd	%%ymm12,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm13,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	     (%%rcx),%%ymm1,%%ymm1	\n\t vmulpd	     (%%r8),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x200(%%rdx),%%ymm2,%%ymm2	\n\t vmulpd	0x200(%%r9),%%ymm10,%%ymm10	\n\t"\
		/* lcol/rcol results go into even/odd-index slots, resp.: */\
		"vmovaps	%%ymm1,0xf80(%%rdi)		\n\t	vmovaps 	%%ymm9 ,0xfa0(%%rdi)	\n\t"\
		"vmovaps	%%ymm2,0xfc0(%%rdi)		\n\t	vmovaps 	%%ymm10,0xfe0(%%rdi)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13"	/* Clobbered registers */\
	);\
	}

#endif

#ifdef USE_AVX512

	// 16-fold analog of AVX_cmplx_carry_fast_pow2_errcheck_X8. No explicit software prefetches here: tests on KNL showed it is faster to rely on the hardware prefetcher:
	#define AVX_cmplx_carry_fast_pow2_errcheck_X16(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 8 prefetches-from-main-data-array spread through this macro */\
	/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
	[3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum
	[4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum
	[5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum
	[6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum
	*/\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0-f. Outputs into r0-f: */\
		"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
		"kmovw	%%eax,%%k1			\n\t"\
		/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
		could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
		"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
		"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
			"vmovq		%%rax,%%xmm0 		\n\t"\
			"vmovq		%%rbx,%%xmm1 		\n\t"\
			"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
			"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
	/********************************************************************/\
	/* Second 8-way transpose. Inputs from r10-1f. Outputs into r10-1f: */\
	/********************************************************************/\
		"addq		$0x400,%%rax		\n\t"\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		"subq		$0x400,%%rax			\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%zmm20	\n\t"/* prp_mult, broadcast to all double-slots of zmm20 */\
	/********** RCOL REGISTERS NUMBERED += 16 W.R.TO ANALOGOUS LCOL-REGISTERS **********/\
		"movq	%[__cy],%%rbx				\n\t"\
		"vmovaps	    (%%rbx),%%zmm1		\n\t	vmovaps	0x40(%%rbx),%%zmm17		\n\t"/* zmm1,17 = Our 2 eight-double cy_in[0:7][8:15]-vectors */\
		/* LOACC wts-data occupy 32 zmm-sized slots starting at (vec_dbl*)half_arr + 0 : */\
		"movq	%[__half_arr],%%rdi			\n\t	vmovaps -0x80(%%rdi),%%zmm2		\n\t"/* zmm2 = maxerr */\
		/* In AVX-512 mode, the 4 doubles base[0],baseinv[1],wts_mult[1],inv_mult[0] are in d0-3 slots
		of otherwise-unused sse2_rnd vec_dbl, that is in -0x40(rdi) + 0x[0,8,10,18]:: */\
		"vbroadcastsd -0x40(%%rdi),%%zmm10	\n\t"\
		"vbroadcastsd -0x38(%%rdi),%%zmm11	\n\t"\
		"vbroadcastsd -0x30(%%rdi),%%zmm12	\n\t"\
		"vbroadcastsd -0x28(%%rdi),%%zmm13	\n\t	vaddpd %%zmm13,%%zmm13,%%zmm14	\n\t"/* zmm13,14 have inv_mult[0] (needed for conditional-doubling), inv_mult[1] (needed for (wt_re >= inv_mult[1]) comparison) */\
		"movq	%[__sse_nm1],%%rbx			\n\t	vmovaps	(%%rbx),%%zmm15			\n\t"/* PERSISTENT COPY OF SSE_N  REMAINS IN zmm15. */\
		"movq	%[__sse_bw] ,%%rbx			\n\t"/* RBX HOLDS ADDRESS OF SSE_BW */\
		"movq	%[__bjmod_0],%%rsi			\n\t	vmovaps	(%%rsi),%%zmm3			\n\t"/* bjmod[0:15], PERSISTENT COPY IN zmm3 */\
		"movq	%[__sse_sw] ,%%rsi			\n\t	vmovaps	(%%rsi),%%zmm4			\n\t"/* sw,  16-fold PERSISTENT COPY IN zmm4 */\
		"movq	%[__sign_mask],%%rsi		\n\t	vmovaps %%zmm2,%%zmm18			\n\t"/* Rcol-copy of maxerr, allowing both cols to do independent updates with just one merge at end */\
	/**********************************/\
	/* Do A.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x000(%%rax),%%zmm0		\n\t	vmovaps	0x400(%%rax),%%zmm16	\n\t"/* Load data */\
		"vmovaps	0x080(%%rdi),%%zmm6		\n\t	vmovaps	0x0c0(%%rdi),%%zmm22	\n\t"/* wi */\
		"vmovaps	0x000(%%rdi),%%zmm5		\n\t	vmovaps	0x040(%%rdi),%%zmm21	\n\t"/* wt for our 16 independent carry-chains */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"/* x *= wtinv */\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t"/* Set bit in k1 if sw < bjmod[0:7] ; Opmask K1 is analog of AVX-mode bitmask stored in R10 */\
	"kmovw	%[__i],%%k2						\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing K1 (whose */\
	"kxorw	%%k2,%%k1,%%k1					\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"knotw	%%k1,%%k2						\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"/* Upper halves of above-computed 16-bit opmasks, used for rcol operands */\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"/* [3] Fwd-base mults */\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"/* [4] Inv-base mults */\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"/* x - temp */\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"/* cpy temp */\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"/* temp*baseinv */\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"/* K1,3 = (wt >= inv_mult[1]) [Do compare as (inv_mult[1] < wt)] */\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"/* K2,4 = inverse-masks */\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"/* [5] [LOACC] wts_mult */\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"/* [6] [LOACC] inv_mult */\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM0~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"/* bjmod[0:15] += bw */\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"/* bjmod[0:15] &= nm1 */\
		"vmovaps	%%zmm0,     (%%rax) 	\n\t	vmovaps	%%zmm16,0x400(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x000(%%rdi)		\n\t	vmovaps	%%zmm21,0x040(%%rdi)	\n\t"/* Store wt */\
		"vmovaps	%%zmm6,0x080(%%rdi)		\n\t	vmovaps	%%zmm22,0x0c0(%%rdi)	\n\t"/* Store wi */\
		"\n\t"\
	/**********************************/\
	/* Do A.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x040(%%rax),%%zmm0		\n\t	vmovaps	0x440(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x180(%%rdi),%%zmm6		\n\t	vmovaps	0x1c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x100(%%rdi),%%zmm5		\n\t	vmovaps	0x140(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x040(%%rax) 	\n\t	vmovaps	%%zmm16,0x440(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x100(%%rdi)		\n\t	vmovaps	%%zmm21,0x140(%%rdi)	\n\t"/* Store wt_im */\
		"vmovaps	%%zmm6,0x180(%%rdi)		\n\t	vmovaps	%%zmm22,0x1c0(%%rdi)	\n\t"/* Store wi_im */\
		"\n\t"\
	/**********************************/\
	/* Do B.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x080(%%rax),%%zmm0		\n\t	vmovaps	0x480(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x280(%%rdi),%%zmm6		\n\t	vmovaps	0x2c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x200(%%rdi),%%zmm5		\n\t	vmovaps	0x240(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x080(%%rax) 	\n\t	vmovaps	%%zmm16,0x480(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x200(%%rdi)		\n\t	vmovaps	%%zmm21,0x240(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x280(%%rdi)		\n\t	vmovaps	%%zmm22,0x2c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x0c0(%%rax),%%zmm0		\n\t	vmovaps	0x4c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x380(%%rdi),%%zmm6		\n\t	vmovaps	0x3c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x300(%%rdi),%%zmm5		\n\t	vmovaps	0x340(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x0c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x4c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x300(%%rdi)		\n\t	vmovaps	%%zmm21,0x340(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x380(%%rdi)		\n\t	vmovaps	%%zmm22,0x3c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x100(%%rax),%%zmm0		\n\t	vmovaps	0x500(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x480(%%rdi),%%zmm6		\n\t	vmovaps	0x4c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x400(%%rdi),%%zmm5		\n\t	vmovaps	0x440(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x100(%%rax) 	\n\t	vmovaps	%%zmm16,0x500(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x400(%%rdi)		\n\t	vmovaps	%%zmm21,0x440(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x480(%%rdi)		\n\t	vmovaps	%%zmm22,0x4c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x140(%%rax),%%zmm0		\n\t	vmovaps	0x540(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x580(%%rdi),%%zmm6		\n\t	vmovaps	0x5c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x500(%%rdi),%%zmm5		\n\t	vmovaps	0x540(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x140(%%rax) 	\n\t	vmovaps	%%zmm16,0x540(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x500(%%rdi)		\n\t	vmovaps	%%zmm21,0x540(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x580(%%rdi)		\n\t	vmovaps	%%zmm22,0x5c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x180(%%rax),%%zmm0		\n\t	vmovaps	0x580(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x680(%%rdi),%%zmm6		\n\t	vmovaps	0x6c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x600(%%rdi),%%zmm5		\n\t	vmovaps	0x640(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x180(%%rax) 	\n\t	vmovaps	%%zmm16,0x580(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x600(%%rdi)		\n\t	vmovaps	%%zmm21,0x640(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x680(%%rdi)		\n\t	vmovaps	%%zmm22,0x6c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x1c0(%%rax),%%zmm0		\n\t	vmovaps	0x5c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x780(%%rdi),%%zmm6		\n\t	vmovaps	0x7c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x700(%%rdi),%%zmm5		\n\t	vmovaps	0x740(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x1c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x5c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x700(%%rdi)		\n\t	vmovaps	%%zmm21,0x740(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x780(%%rdi)		\n\t	vmovaps	%%zmm22,0x7c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x200(%%rax),%%zmm0		\n\t	vmovaps	0x600(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x880(%%rdi),%%zmm6		\n\t	vmovaps	0x8c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x800(%%rdi),%%zmm5		\n\t	vmovaps	0x840(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x200(%%rax) 	\n\t	vmovaps	%%zmm16,0x600(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x800(%%rdi)		\n\t	vmovaps	%%zmm21,0x840(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x880(%%rdi)		\n\t	vmovaps	%%zmm22,0x8c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x240(%%rax),%%zmm0		\n\t	vmovaps	0x640(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x980(%%rdi),%%zmm6		\n\t	vmovaps	0x9c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x900(%%rdi),%%zmm5		\n\t	vmovaps	0x940(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x240(%%rax) 	\n\t	vmovaps	%%zmm16,0x640(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x900(%%rdi)		\n\t	vmovaps	%%zmm21,0x940(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x980(%%rdi)		\n\t	vmovaps	%%zmm22,0x9c0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x280(%%rax),%%zmm0		\n\t	vmovaps	0x680(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xa80(%%rdi),%%zmm6		\n\t	vmovaps	0xac0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xa00(%%rdi),%%zmm5		\n\t	vmovaps	0xa40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x280(%%rax) 	\n\t	vmovaps	%%zmm16,0x680(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xa00(%%rdi)		\n\t	vmovaps	%%zmm21,0xa40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xa80(%%rdi)		\n\t	vmovaps	%%zmm22,0xac0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x2c0(%%rax),%%zmm0		\n\t	vmovaps	0x6c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xb80(%%rdi),%%zmm6		\n\t	vmovaps	0xbc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xb00(%%rdi),%%zmm5		\n\t	vmovaps	0xb40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x2c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x6c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xb00(%%rdi)		\n\t	vmovaps	%%zmm21,0xb40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xb80(%%rdi)		\n\t	vmovaps	%%zmm22,0xbc0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x300(%%rax),%%zmm0		\n\t	vmovaps	0x700(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xc80(%%rdi),%%zmm6		\n\t	vmovaps	0xcc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xc00(%%rdi),%%zmm5		\n\t	vmovaps	0xc40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x300(%%rax) 	\n\t	vmovaps	%%zmm16,0x700(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xc00(%%rdi)		\n\t	vmovaps	%%zmm21,0xc40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xc80(%%rdi)		\n\t	vmovaps	%%zmm22,0xcc0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x340(%%rax),%%zmm0		\n\t	vmovaps	0x740(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xd80(%%rdi),%%zmm6		\n\t	vmovaps	0xdc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xd00(%%rdi),%%zmm5		\n\t	vmovaps	0xd40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x340(%%rax) 	\n\t	vmovaps	%%zmm16,0x740(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xd00(%%rdi)		\n\t	vmovaps	%%zmm21,0xd40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xd80(%%rdi)		\n\t	vmovaps	%%zmm22,0xdc0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x380(%%rax),%%zmm0		\n\t	vmovaps	0x780(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xe80(%%rdi),%%zmm6		\n\t	vmovaps	0xec0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xe00(%%rdi),%%zmm5		\n\t	vmovaps	0xe40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x380(%%rax) 	\n\t	vmovaps	%%zmm16,0x780(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xe00(%%rdi)		\n\t	vmovaps	%%zmm21,0xe40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xe80(%%rdi)		\n\t	vmovaps	%%zmm22,0xec0(%%rdi)	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im 16-tet: Data in m0,16: */\
	/**********************************/\
		"vmovaps	0x3c0(%%rax),%%zmm0		\n\t	vmovaps	0x7c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xf80(%%rdi),%%zmm6		\n\t	vmovaps	0xfc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xf00(%%rdi),%%zmm5		\n\t	vmovaps	0xf40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vmovaps	%%zmm12,%%zmm23			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k4%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t	vmulpd	%%zmm23,%%zmm21,%%zmm21	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		/* Get ready for next set [RE8~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpandd		%%zmm15,%%zmm3,%%zmm3	\n\t"\
		"vmovaps	%%zmm0,0x3c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x7c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xf00(%%rdi)		\n\t	vmovaps	%%zmm21,0xf40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xf80(%%rdi)		\n\t	vmovaps	%%zmm22,0xfc0(%%rdi)	\n\t"\
		"\n\t"\
		/* Store the bjmodn[0:15] index 16-tet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	vmovaps	%%zmm3,(%%rbx)	\n\t"\
		/* Store our pair of cy_out octets-of-doubles: */\
		"movq	%[__cy],%%rbx				\n\t"\
		"vmovaps		%%zmm1,(%%rbx)		\n\t	vmovaps	%%zmm17,0x40(%%rbx)		\n\t"\
		/* Store maxerr, after merging the separate lcol,rcol maxerr results: */\
		"vmaxpd	%%zmm2,%%zmm18,%%zmm2		\n\t	vmovaps	%%zmm2,-0x80(%%rdi)		\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately): Inputs from r0-f. Outputs into r0-f: */	\
		"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
		"kmovw	%%eax,%%k1			\n\t"\
		/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
		could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
		"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
		"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
			"vmovq		%%rax,%%xmm0 		\n\t"\
			"vmovq		%%rbx,%%xmm1 		\n\t"\
			"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
			"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
	/********************************************************************/\
	/* Second 8-way transpose. Inputs from r10-1f. Outputs into r10-1f: */\
	/********************************************************************/\
		"addq		$0x400,%%rax		\n\t"\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm20","xmm21","xmm22","xmm23","xmm24","xmm25","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	#define AVX_cmplx_carry_fast_pow2_errcheck_X8(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 8 prefetches-from-main-data-array spread through this macro */\
	/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
	[3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum
	[4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum
	[5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum
	[6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum
	*/\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0-f. Outputs into r0-f: */\
		"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
		"kmovw	%%eax,%%k1			\n\t"\
		/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
		could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
		"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
		"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
			"vmovq		%%rax,%%xmm0 		\n\t"\
			"vmovq		%%rbx,%%xmm1 		\n\t"\
			"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
			"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%zmm20	\n\t"/* prp_mult, broadcast to all double-slots of zmm20 */\
		"movq		%[__cy],%%rbx			\n\t	vmovaps	     (%%rbx),%%zmm1	\n\t"/* zmm1 = Our eight-double cy_in */\
		/* LOACC wts-data occupy 32 zmm-sized slots starting at (vec_dbl*)half_arr + 0 : */\
		"movq	%[__half_arr],%%rdi			\n\t	vmovaps -0x80(%%rdi),%%zmm2	\n\t"/* zmm2 = maxerr */\
		/* In AVX-512 mode, the 4 doubles base[0],baseinv[1],wts_mult[1],inv_mult[0] are in d0-3 slots
		of otherwise-unused sse2_rnd vec_dbl, that is in -0x40(rdi) + 0x[0,8,10,18]:: */\
		"vbroadcastsd -0x40(%%rdi),%%zmm10	\n\t"\
		"vbroadcastsd -0x38(%%rdi),%%zmm11	\n\t"\
		"vbroadcastsd -0x30(%%rdi),%%zmm12	\n\t"\
		"vbroadcastsd -0x28(%%rdi),%%zmm13	\n\t	vaddpd %%zmm13,%%zmm13,%%zmm14	\n\t"/* ymm13,14 have inv_mult[0] (needed for conditional-doubling), inv_mult[1] (needed for (wt_re >= inv_mult[1]) comparison) */\
		"movq	%[__sse_nm1],%%rbx			\n\t	vmovaps	(%%rbx),%%ymm15			\n\t"/* PERSISTENT COPY OF SSE_NM1 REMAINS IN ymm15. */\
		"movq	%[__sse_bw] ,%%rbx			\n\t"/* RBX HOLDS ADDRESS OF SSE_BW */\
		"movq	%[__bjmod_0],%%rsi			\n\t	vmovaps	(%%rsi),%%ymm3			\n\t"/* bjmod[0:7], PERSISTENT COPY IN ymm3 */\
		"movq	%[__sse_sw] ,%%rsi			\n\t	vmovaps	(%%rsi),%%ymm4			\n\t"/* sw,  8-fold PERSISTENT COPY IN ymm4 */\
		"movq	%[__sign_mask],%%rsi		\n\t"\
	/**********************************/\
	/* Do A.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
		"vmovaps	0x000(%%rax),%%zmm0		\n\t"\
		"vmovaps	0x080(%%rdi),%%zmm6		\n\t"/* wi_re - 8-way carry macro only uses every other one of these local-mem-slots */\
		"vmovaps	0x000(%%rdi),%%zmm5		\n\t"/* wt_re for our 8 independent carry-chains */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"/* x *= wtinv */\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t"/* Set bit in k1 if sw < bjmod[0:7] ; Opmask K1 is analog of AVX-mode bitmask stored in R10 */\
	"kmovw	%[__i],%%k2						\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing K1 (whose */\
	"kxorw	%%k2,%%k1,%%k1					\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"knotw	%%k1,%%k2						\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"/* [3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum: */\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"/* [4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum: */\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"/* x - temp */\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"/* cpy temp */\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"/* temp*baseinv */\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"/* K1 = (wt_re >= inv_mult[1]) [Do compare as (inv_mult[1] < wt_re)]; K2 = inverse-mask */\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"/* [5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum: */\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"/* [6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum: */\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"/* wi_re *= inv_mult[i] */\
		/* Get ready for next set [IM0~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"/* bjmod[0:7] += bw */\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"/* bjmod[0:7] += bw (mod n) */\
		"vmovaps	%%zmm0,     (%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x000(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%zmm6,0x080(%%rdi)		\n\t"/* Store wi_re */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x040(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x180(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x100(%%rdi),%%zmm5		\n\t"/* wt_im for our 8 independent carry-chains */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"/* wi_im *= inv_mult[i] */\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x040(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x100(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%zmm6,0x180(%%rdi)		\n\t"/* Store wi_im */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x080(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x280(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x200(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x080(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x200(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x280(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x0c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x380(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x300(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x0c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x300(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x380(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x100(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x480(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x400(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x100(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x400(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x480(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x140(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x580(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x500(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x140(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x500(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x580(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x180(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x680(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x600(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x180(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x600(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x680(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x1c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x780(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x700(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x1c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x700(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x780(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
		"vmovaps	0x200(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x880(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x800(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x200(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x800(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x880(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x240(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x980(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x900(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x240(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x900(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x980(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x280(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xa80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xa00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x280(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xa00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xa80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x2c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xb80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xb00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x2c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xb00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xb80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x300(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xc80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xc00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x300(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xc00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xc80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x340(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xd80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xd00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x340(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xd00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xd80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x380(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xe80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xe00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x380(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xe00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xe80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x3c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xf80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xf00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE8~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpand		%%ymm15,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x3c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xf00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xf80(%%rdi)		\n\t"\
		"\n\t"\
		/* Store the bjmodn[0:7] index octet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	vmovaps	%%ymm3,(%%rbx)	\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy] ,%%rbx			\n\t	vmovaps	%%zmm1,(%%rbx)	\n\t"\
		/* Store maxerr: */\
		"vmovaps	%%zmm2,-0x80(%%rdi)		\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately): Inputs from r0-f. Outputs into r0-f: */	\
		"movl	$0b00110011,%%eax	\n\t"/* Carry step has overwritten k1, re-init for transpose */\
		"kmovw	%%eax,%%k1			\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Outputs are now ordered - write 'em back to memory: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX2)	// FMA-using versions of the 8-way and 4-way macros def'd for AVX:

  #ifdef GCC_5PLUS

	// gcc 4.x does not support the needed AVX2 8-way int instructions (while still being fine for the floating-FMA
	// used for the FFT), so require an added compile-time define to enable this version of 8-way. Based on my
	// Broadwell timings this is no faster than the version using half-width 128-bit arithmetic for the integer
	// math attendant to Mersenne-mod IBDWT, so did not create a non-pow2-FFT-length analog, and left calls to
	// this macro around just in the radix-16 carry routine I used for my comparative timing tests.
	//
	#define AVX2_cmplx_carry_fast_pow2_errcheck_X8(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		/* And un-fiddle the base address: */\
		"subq	$0x100,%%rax				\n\t"\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	(%%rbx),%%ymm12			\n\t	vmovaps	(%%rcx),%%ymm13	\n\t"/* ymm12,13 = Our pair of four-double cy_ins */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 64 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi			\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14		\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"vmovaps	     (%%rax),%%ymm0 	\n\t	vmovaps	0x100(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__bjmod_0],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%ymm15			\n\t"/* bjmod[0:7], persistent copy in ymm15 */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%ymm10			\n\t"/* sw[0:7] */\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"/* sw[0:7] - bjmod[0:7] */\
	"vmovmskps	%%ymm10,%%rsi			\n\t"/* Extract sign bits into 8-bit signmask, idxs into base/inv table */\
	"movslq	%[__i]	,%%rbx				\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%rsi				\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	/* lcol = <0:3> << 5, rcol = <4:7> << 5 (shift = 5 to give ptr offset for ymm-size data: */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t	vmovaps	0xc60(%%rdi),%%ymm5 		\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t	vmovaps	0xc20(%%rdi),%%ymm3 		\n\t"/* wt_re */\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"/* Extract cmp-results into 8-bit signmask */\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,     (%%rax)	\n\t	vmovaps	%%ymm1 ,0x100(%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xc20(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xc60(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x020(%%rax),%%ymm0 	\n\t	vmovaps	0x120(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"/* sw[0:3] */\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xce0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t	vmovaps	0xca0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x020(%%rax)	\n\t	vmovaps	%%ymm1 ,0x120(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xca0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xce0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x040(%%rax),%%ymm0 	\n\t	vmovaps	0x140(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t	vmovaps	0xd60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t	vmovaps	0xd20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x040(%%rax)	\n\t	vmovaps	%%ymm1 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xd20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xd60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x060(%%rax),%%ymm0 	\n\t	vmovaps	0x160(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xde0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t	vmovaps	0xda0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x060(%%rax)	\n\t	vmovaps	%%ymm1 ,0x160(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xda0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xde0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
	"vmovaps	0x080(%%rax),%%ymm0 	\n\t	vmovaps	0x180(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t	vmovaps	0xe60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t	vmovaps	0xe20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)	\n\t	vmovaps	%%ymm1 ,0x180(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xe20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xe60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0a0(%%rax),%%ymm0 	\n\t	vmovaps	0x1a0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t	vmovaps	0xee0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t	vmovaps	0xea0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0a0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1a0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xea0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xee0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t	vmovaps	0x1c0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t	vmovaps	0xf60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t	vmovaps	0xf20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0c0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xf20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xf60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm0 	\n\t	vmovaps	0x1e0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),	%%ymm10		\n\t"\
	"vpsubd		%%ymm15,%%ymm10,%%ymm10	\n\t"\
	"vmovmskps	%%ymm10,	%%rsi		\n\t"\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	"movq	%%rsi,	%%r10				\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $1,	%%rsi			\n\t"\
	"andq	$0x1e0,	%%r10				\n\t	andq	$0x1e0,	%%rsi			\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xfe0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t	vmovaps	0xfa0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0e0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1e0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xfa0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xfe0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd		(%%rbx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] += bw  */\
		"vpand		(%%rcx),%%ymm15,%%ymm15	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
		/* Store bjmodn index octet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t"\
		"vmovaps	%%ymm15,(%%rbx)			\n\t"\
		/* Store cy_outs: */\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)			\n\t	vmovaps	%%ymm13,(%%rcx)	\n\t"/* ymm12,13 = Our pair of four-double cy_outs */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5,6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

  #else

	// 8-way version of the AVX 4-way carry macro,
	// analogous to the 128-bit-setting 4-way SSE2_cmplx_carry_fast_pow2_errcheck macro:
	#define AVX_cmplx_carry_fast_pow2_errcheck_X8(Xdata,XcyA,XcyB,Xbjmod_0,Xbjmod_4,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5,6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		/* And un-fiddle the base address: */\
		"subq	$0x100,%%rax				\n\t"\
	/*** mm6-9 *FREE* between here and closing un-transpose block ... each processing ***/\
	/*** column below uses 5 vector registers, making it tempting to add a 3rd column ***/\
	/*** In this version of the carry macro use 1 of the free vec-regs for bjmod[4:7] ***/\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	(%%rbx),%%ymm12			\n\t	vmovaps	(%%rcx),%%ymm13	\n\t"/* ymm12,13 = Our pair of four-double cy_ins */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi			\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14		\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"vmovaps	     (%%rax),%%ymm0 	\n\t	vmovaps	0x100(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__bjmod_0],%%rsi			\n\t	movq	%[__bjmod_4],%%rcx	\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t	vmovaps	(%%rcx),%%xmm6		\n\t"/* bjmod[0:3] and [4:7], persistent copies in xmm15,xmm6, resp. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"movslq	%[__i]	,%%rbx				\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing r10 (whose */\
	"xorq	%%rbx	,%%r10				\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	/* lcol = <0:3> << 5, rcol = <4:7> << 5 (shift = 5 to give ptr offset for ymm-size data: */\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t	vmovaps	0xc60(%%rdi),%%ymm5 		\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t	vmovaps	0xc20(%%rdi),%%ymm3 		\n\t"/* wt_re */\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"/* Extract cmp-results into 8-bit signmask */\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,     (%%rax)	\n\t	vmovaps	%%ymm1 ,0x100(%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xc20(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xc60(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x020(%%rax),%%ymm0 	\n\t	vmovaps	0x120(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xce0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t	vmovaps	0xca0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x020(%%rax)	\n\t	vmovaps	%%ymm1 ,0x120(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xca0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xce0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x040(%%rax),%%ymm0 	\n\t	vmovaps	0x140(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t	vmovaps	0xd60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t	vmovaps	0xd20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x040(%%rax)	\n\t	vmovaps	%%ymm1 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xd20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xd60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x060(%%rax),%%ymm0 	\n\t	vmovaps	0x160(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xde0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t	vmovaps	0xda0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x060(%%rax)	\n\t	vmovaps	%%ymm1 ,0x160(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xda0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xde0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
	"vmovaps	0x080(%%rax),%%ymm0 	\n\t	vmovaps	0x180(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t	vmovaps	0xe60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t	vmovaps	0xe20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)	\n\t	vmovaps	%%ymm1 ,0x180(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xe20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xe60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0a0(%%rax),%%ymm0 	\n\t	vmovaps	0x1a0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t	vmovaps	0xee0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t	vmovaps	0xea0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0a0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1a0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xea0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xee0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t	vmovaps	0x1c0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t	vmovaps	0xf60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t	vmovaps	0xf20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0c0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xf20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xf60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm0 	\n\t	vmovaps	0x1e0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xfe0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t	vmovaps	0xfa0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4,%%ymm0,%%ymm0	\n\t	vmulpd	%%ymm5,%%ymm1,%%ymm1		\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0e0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1e0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xfa0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xfe0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
		/* Store the two bjmodn index quartets: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	movq	%[__bjmod_4],%%rcx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t	vmovaps	%%xmm6 ,(%%rcx)			\n\t"\
		/* Store cy_outs: */\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)			\n\t	vmovaps	%%ymm13,(%%rcx)	\n\t"/* ymm12,13 = Our pair of four-double cy_outs */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__bjmod_4]	"m" (Xbjmod_4)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

   #endif	// end #ifdef GCC_5PLUS

	// Register-name choices in the non-transpose middle section reflect preparation for an 8-way version of this macro:
	#define AVX_cmplx_carry_fast_pow2_errcheck_X4(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm1-7 back to memory to free up vector registers: */\
		"												vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		"movq		%[__cy],%%rbx		\n\t"\
		"vmovaps	(%%rbx),%%ymm12		\n\t"/* ymm12 = cy_in */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14	\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"movq	%[__bjmod_0],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm15. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm10			\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rbx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		/* This 4-way vector-carry macro uses only the even-indexed 16 of said slots, having byte offsets == 0 (mod 0x40) */\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] &= nm1; & doesn't care whether integer [pand] or floating [andpd], but data are int, so use pand for form's sake */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x20(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi		\n\t"\
	"vmovaps	(%%rsi),	%%xmm10		\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x20(%%rax)		\n\t"/* Store A.im to free up a register */\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x40(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x40(%%rax)		\n\t"/* Store B.re to free up a register */\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x60(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x60(%%rax)		\n\t"/* Store B.im to free up a register */\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE2~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x80(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t"/* Store C.re to free up a register */\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM2~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xa0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xa0(%%rax)		\n\t"/* Store C.im to free up a register */\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE3~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xc0(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xc0(%%rax)		\n\t"/* Store D.re to free up a register */\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM3~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xe0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xe0(%%rax)		\n\t"/* Store D.im to free up a register */\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rbx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)	\n\t"/* cy_out = ymm12 */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm12","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX)

	// 8-way version of the AVX 4-way carry macro, processing two four-double
	// carry chains per pass; analogous to the 4-way (128-bit SIMD build)
	// SSE2_cmplx_carry_fast_pow2_errcheck macro:
	#define AVX_cmplx_carry_fast_pow2_errcheck_X8(Xdata,XcyA,XcyB,Xbjmod_0,Xbjmod_4,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		/* And un-fiddle the base address: */\
		"subq	$0x100,%%rax				\n\t"\
	/*** mm6-9 *FREE* between here and closing un-transpose block ... each processing ***/\
	/*** column below uses 5 vector registers, making it tempting to add a 3rd column ***/\
	/*** In this version of the carry macro use 1 of the free vec-regs for bjmod[4:7] ***/\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	(%%rbx),%%ymm12			\n\t	vmovaps	(%%rcx),%%ymm13	\n\t"/* ymm12,13 = Our pair of four-double cy_ins */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi			\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14		\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"vmovaps	     (%%rax),%%ymm0 	\n\t	vmovaps	0x100(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__bjmod_0],%%rsi			\n\t	movq	%[__bjmod_4],%%rcx	\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t	vmovaps	(%%rcx),%%xmm6		\n\t"/* bjmod[0:3] and [4:7], persistent copies in xmm15,xmm6, resp. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"movslq	%[__i]	,%%rbx				\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%r10				\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	/* lcol = <0:3> << 5, rcol = <4:7> << 5 (shift = 5 to give ptr offset for ymm-size data: */\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t	vmovaps	0xc60(%%rdi),%%ymm5 		\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t	vmovaps	0xc20(%%rdi),%%ymm3 		\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"/* cy_out */\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"/* cy*base */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"/* Extract cmp-results into 8-bit signmask */\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,     (%%rax)	\n\t	vmovaps	%%ymm1 ,0x100(%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xc20(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xc60(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x020(%%rax),%%ymm0 	\n\t	vmovaps	0x120(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xce0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t	vmovaps	0xca0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x020(%%rax)	\n\t	vmovaps	%%ymm1 ,0x120(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xca0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xce0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x040(%%rax),%%ymm0 	\n\t	vmovaps	0x140(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t	vmovaps	0xd60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t	vmovaps	0xd20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x040(%%rax)	\n\t	vmovaps	%%ymm1 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xd20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xd60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x060(%%rax),%%ymm0 	\n\t	vmovaps	0x160(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xde0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t	vmovaps	0xda0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x060(%%rax)	\n\t	vmovaps	%%ymm1 ,0x160(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xda0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xde0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
	"vmovaps	0x080(%%rax),%%ymm0 	\n\t	vmovaps	0x180(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t	vmovaps	0xe60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t	vmovaps	0xe20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)	\n\t	vmovaps	%%ymm1 ,0x180(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xe20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xe60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0a0(%%rax),%%ymm0 	\n\t	vmovaps	0x1a0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t	vmovaps	0xee0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t	vmovaps	0xea0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0a0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1a0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xea0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xee0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t	vmovaps	0x1c0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t	vmovaps	0xf60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t	vmovaps	0xf20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0c0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xf20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xf60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm0 	\n\t	vmovaps	0x1e0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xfe0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t	vmovaps	0xfa0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0e0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1e0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xfa0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xfe0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_bw]	,%%rbx			\n\t"\
		"movq	%[__sse_nm1],%%rcx			\n\t"\
		"vpaddd	(%%rbx),%%xmm15,%%xmm15		\n\t	vpaddd	(%%rbx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] += bw  */\
		"vpand	(%%rcx),%%xmm15,%%xmm15		\n\t	vpand	(%%rcx),%%xmm6,%%xmm6	\n\t"/* bjmod[0:7] &= nm1 */\
		"\n\t"\
		/* Store the two bjmodn index quartets: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	movq	%[__bjmod_4],%%rcx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t	vmovaps	%%xmm6 ,(%%rcx)			\n\t"\
		/* Store cy_outs: */\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)			\n\t	vmovaps	%%ymm13,(%%rcx)	\n\t"/* ymm12,13 = Our pair of four-double cy_outs */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__bjmod_4]	"m" (Xbjmod_4)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

	// Register-name choices in the non-transpose middle section reflect preparation for an 8-way version of this macro:
	#define AVX_cmplx_carry_fast_pow2_errcheck_X4(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm1-7 back to memory to free up vector registers: */\
		"												vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		"movq		%[__cy],%%rbx		\n\t"\
		"vmovaps	(%%rbx),%%ymm12		\n\t"/* ymm12 = cy_in */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14	\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"movq	%[__bjmod_0],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm15. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm10			\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rbx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		/* This 4-way vector-carry macro uses only the even-indexed 16 of said slots, having byte offsets == 0 (mod 0x40) */\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] ... store product in ymm10, since still need cy in ymm12 */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] &= nm1; & doesn't care whether integer [pand] or floating [andpd], but data are int, so use pand for form's sake */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x20(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi		\n\t"\
	"vmovaps	(%%rsi),	%%xmm10		\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x20(%%rax)		\n\t"/* Store A.im to free up a register */\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x40(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x40(%%rax)		\n\t"/* Store B.re to free up a register */\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x60(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x60(%%rax)		\n\t"/* Store B.im to free up a register */\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE2~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x80(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t"/* Store C.re to free up a register */\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM2~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xa0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xa0(%%rax)		\n\t"/* Store C.im to free up a register */\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE3~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xc0(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xc0(%%rax)		\n\t"/* Store D.re to free up a register */\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM3~] : */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xe0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xe0(%%rax)		\n\t"/* Store D.im to free up a register */\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_bw]	,%%rbx		\n\t"\
		"movq	%[__sse_nm1],%%rcx		\n\t"\
		"vpaddd		(%%rbx),%%xmm15,%%xmm15	\n\t"\
		"vpand		(%%rcx),%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rbx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)	\n\t"/* cy_out = ymm12 */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm12","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

#endif

// Remaining carry macros are generic AVX:

#ifdef USE_AVX

	/********* Names ending in _X4 denote "Genuine AVX" carry macros, which process 8 AVX-sized vectors (= 32 doubles): **********/
	// Note that in this "true AVX" carry, [n_minus_sil,n_minus_silp1,sinwt,sinwtm1] are all pointers to (struct uint32x4) data, rather than 32bit ints as for SSE2-style macros!
	/*
	Aug 2013: The following "true AVX" carry macros - fusing the fancy-indexing footwork of the legacy SSE2 mersenne-mod-DWT
	carry macros and the AVX data-permute aspects of the AVX-based Fermat-mod carry macros - ran incredibly, awfully, unbelievably
	slowly in the initial implementation. I eventually traced that back to the mixing of legacy SSE instructions (using xmm-form registers)
	in the indexing-computation portions of the code with AVX instructions used for weights and carries in the new AVX code. The solution
	was to simply prepend a "v" to the legacy SSE instructions and (for ones where the VEX form of the instruction adds a third operand)
	to duplicate the original SRC+DEST operand (rightmost on this AT&T/GCC-syntax inline ASM) in order to satisfy the 3-operand syntax.

	CF. Intel's own "Mixing SSE and AVX bad, very bad" cautions in the following "Avoiding AVX-SSE Transition Penalties" PDF:

		http://software.intel.com/sites/default/files/m/d/4/1/d/8/11MC12_Avoiding_2BAVX-SSE_2BTransition_2BPenalties_2Brh_2Bfinal.pdf

	Here is the money snippet:

		"When using Intel® AVX instructions, it is important to know that mixing 256-bit Intel® AVX instructions
		 with legacy (non VEX-encoded) Intel® SSE instructions may result in penalties that could impact performance.
		 256-bit Intel® AVX instructions operate on the 256-bit ymm registers which are 256-bit extensions of the
		 existing 128-bit xmm registers. 128-bit Intel® AVX instructions operate on the lower 128 bits of the ymm
		 registers and zero the upper 128 bits. However, legacy Intel® SSE instructions operate on the xmm registers
		 and have no knowledge of the upper 128 bits of the ymm registers. Because of this, the hardware saves the
		 contents of the upper 128 bits of the ymm registers when transitioning from 256-bit Intel® AVX to legacy
		 Intel® SSE, and then restores these values when transitioning back from Intel® SSE to Intel® AVX (256-bit
		 or 128-bit). The save and restore operations both cause a penalty that amounts to several tens of clock
		 cycles for each operation."

	Cf. also Agner Fog's "early in the AVX life cycle" commentary at http://software.intel.com/en-us/forums/topic/301853

	To convey a sense of just how severe the timing penalties resulting from such SSE/AVX instruction mixing can be,
	here are sample timings for my AVX-enabled Mlucas code running at an FFT length of 4096 kdoubles on my 3.4 GHz quad-core
	Haswell system, using 2 threads, with a full-time 4-threaded Mlucas run [ongoing multimonth run of F28 at 15360 K] as background load:

	[1] The baseline timing here is set by the well-tested and quite fast pure-AVX Fermat-mod carry macros:
		time ./Mlucas -f26 -fftlen 4096 -iters 100 -radset 0 -nthread 2
		...
		100 iterations of F26 with FFT length 4194304 = 4096 K
		Res64: A42BECD80DAEC4CB. AvgMaxErr = 0.005018834. MaxErr = 0.005859375. Program: E3.0x
		real	0m5.298s
	*	user	0m5.172s

	[2] Now do Mersenne-mod run at same FFT length, using AVX-based FFT-pass code but SSE2-based carry macros,
		tweaked to take account of the differing AVX data layout:
		gcc -c -O3 -DUSE_THREADS -DUSE_AVX radix32*cy*c && gcc -o Mlucas *.o -lm -lpthread
		time ./Mlucas -fftlen 4096 -iters 100 -radset 0 -nthread 2
		...
		100 iterations of M77597293 with FFT length 4194304 = 4096 K
		Res64: 8CC30E314BF3E556. AvgMaxErr = 0.293526786. MaxErr = 0.343750000. Program: E3.0x
		real	0m6.673s
	*	user	0m6.760s

	...which is ~35% slower than the Fermat-mod run at the same FFT length - not a complete and total disaster but still
	very bad compared to the expected 10-20% runtime hit here for the Mersenne-mod computation.
	We hope for better using true-AVX mode for the carry step. Alas, our initial tests are "beyond unpromising":

	[3] gcc -c -O3 -DUSE_THREADS -DUSE_AVX -DUSE_AVX_CARRIES radix32*cy*c && gcc -o Mlucas *.o -lm -lpthread
		time ./Mlucas -fftlen 4096 -iters 100 -radset 0 -nthread 2
		...
		real	0m13.521s
	*	user	0m12.857s	<*** Same FFT code + AVX-based carries more than doubles the overall runtime!!! ***

	[4] Somehow I had failed to come across any of the literature discussing the SSE/AVX mixed-code performance
	penalty in my AVX-related reading. Once I realized that the key difference in instruction mix between the Fermat-mod
	and Mersenne-mod "true AVX" carry code was the presence of non-AVX SSE instructions in the latter, I quickly found
	the above docs detailing the performance hit such code entails, fixed the offending macros and immediately saw rather
	more promising timings, to say the least:

		gcc -c -O3 -DUSE_THREADS -DUSE_AVX -DUSE_AVX_CARRIES radix32*cy*c && gcc -o Mlucas *.o -lm -lpthread
		time ./Mlucas -fftlen 4096 -iters 100 -radset 0 -nthread 2
		...
		real	0m6.287s
	*	user	0m5.952s	<*** More than 2x faster than [3]!

	I.e. the performance *penalty* component alone from using the mixed SSE/AVX carry macros here was greater
	than the entire *runtime* needed for either the "pure-AVX-FFT/pure-SSE-carry" hybrid in [2] or the "true AVX 4 all"
	code, which only suffers a ~15% runtime penalty versus the analogous Fermat-mod code.
	*/
	// Mar 2016: Bumped mem-offsets for broadcast-loads from 0x800-based to 0x1000-based for compatibility with new LOACC-carry memory layout
	#define AVX_cmplx_carry_norm_pow2_errcheck_X4(Xdata,XwtA,XwtB,XwtC,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Move ymm7 to mem to free up a vreg: */\
		"vmovaps	%%ymm7,0x0e0(%%rax)			\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm7	\n\t"/* prp_mult, broadcast to all double-slots of ymm7 */\
		/* Won't need main-array again until output transpose, so re-use rax for half_arr */\
		"movq	 %[__half_arr],%%rax	\n\t"\
		/* half_arr + 16*[0,1,2,3] = [wt,wt_inv,base,baseinv] */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	(%%rbx),%%ymm14	\n\t"/* ymm14 = cy_in */\
		"vmovaps	-0x40(%%rax),%%ymm15	\n\t"/* ymm15 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
		"movq	%[__bjmod_0],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm8 		\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm8. */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"/* sw[0:3] */\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rcx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rcx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x0(%%rcx)	,%%xmm9 	\n\t"/* n_minus_sil in low 32 bits of xmm9  */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"/* Broadcast low 32 bits of xmm9  to all 4 slots of xmm9  */\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x0(%%rdx)	,%%xmm10		\n\t"/* sinwt in low 32 bits of xmm10 */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"/* Broadcast low 32 bits of xmm10 to all 4 slots of xmm10 */\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"/* xmm11 = bjmod[0:3] copy */\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"/* bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm11,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm12 	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x10(%%rbx),%%ymm13	\n\t"/* wtB[j-1] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm13 */\
		"vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0		\n\t"/* x *= wtinv; ymm10 FREE */\
		"vmovaps	%%ymm0,%%ymm10		\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10	\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm0,%%ymm15	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm10,%%ymm14			\n\t"/* cy_out */\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm9 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"/* bjmod[0:3] += bw ; must use packed-INTEGER add [not addpd!] here, severe performance penalty from using addpd. */\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"/* bjmod[0:3] &= nm1; & doesn't care whether integer [pand] or floating [andpd], but data are int, so use pand for form's sake */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"/* sw[0:3] */\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x0(%%rcx)	,%%xmm9 	\n\t"/* n_minus_sil in low 32 bits of xmm9  */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"/* Broadcast low 32 bits of xmm9  to all 4 slots of xmm9  */\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x0(%%rdx)	,%%xmm10		\n\t"/* sinwt in low 32 bits of xmm10 */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"/* Broadcast low 32 bits of xmm10 to all 4 slots of xmm10 */\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"/* xmm11 = bjmod[0:3] copy */\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"/* bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm11,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtC]	,%%rbx		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rbx),%%ymm13	\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1		\n\t"/* x *= wtinv; ymm10 FREE */\
		"vmovaps	%%ymm1,%%ymm10		\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10	\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm1,%%ymm15	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm10,%%ymm14			\n\t"/* cy_out */\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm1 ,%%ymm1 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm9 ,%%ymm1 ,%%ymm1 		\n\t"/* x *= wt */\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x4(%%rcx)	,%%xmm9 	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x4(%%rdx)	,%%xmm10	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmovaps	%%ymm2,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vandpd		(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm2,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm2			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm2 ,%%ymm2 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm2 ,%%ymm2 	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x4(%%rcx)	,%%xmm9 	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x4(%%rdx)	,%%xmm10	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm3,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vandpd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm3,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm3			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm3 ,%%ymm3 	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x8(%%rcx)	,%%xmm9 	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x8(%%rdx)	,%%xmm10	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm4,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm4,%%ymm4	\n\t"\
		"vandpd		(%%rbx),%%ymm4,%%ymm4	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm4,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm4			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm4 ,%%ymm4 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm4 ,%%ymm4 	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x8(%%rcx)	,%%xmm9 	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x8(%%rdx)	,%%xmm10	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm5,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm5,%%ymm5	\n\t"\
		"vandpd		(%%rbx),%%ymm5,%%ymm5	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm5,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm5			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm5 ,%%ymm5 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm5 ,%%ymm5 	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0xC(%%rcx)	,%%xmm9 	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0xC(%%rdx)	,%%xmm10	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm6,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm6,%%ymm6	\n\t"\
		"vandpd		(%%rbx),%%ymm6,%%ymm6	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm6,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm6			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm6 ,%%ymm6 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm6 ,%%ymm6 	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0xC(%%rcx)	,%%xmm9 	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0xC(%%rdx)	,%%xmm10	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
	/* reload ymm7 from mem - this overwrites the prp_mult data, which is why we first copy the latter to just-fred ymm11: */\
	"movq		%[__data],%%rax			\n\t"\
	"vmovaps	%%ymm7,%%ymm11			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm7		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm7,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm7,%%ymm7	\n\t"\
		"vandpd		(%%rbx),%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm11,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult[now in ymm11] */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm7,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm7			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm7 ,%%ymm7 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm7 ,%%ymm7 	\n\t"\
		/* Update wts-array pointers in preparation for next call of the macro: */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"\
		"addq	$0x20	,%%rax			\n\t"/* add0 += 4 */\
		"subq	$0x20	,%%rbx			\n\t"/* add1 -= 4 */\
		"subq	$0x20	,%%rcx			\n\t"/* add2 -= 4 */\
		"movq	%%rax	,%[__wtA]		\n\t"\
		"movq	%%rbx	,%[__wtB]		\n\t"\
		"movq	%%rcx	,%[__wtC]		\n\t"\
		"\n\t"\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1],%%rbx		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vpand		(%%rbx),%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"vmovaps	%%xmm8,(%%rcx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm14,(%%rbx)	\n\t"/* ymm14 = cy_in */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm15,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}


	/***********************************************************************************************************/
	/*** Non-power-of-2-FFT version of above AVX_cmplx_carry_norm_pow2_errcheck_X4 Mersenne-mod carry macro: ***/
	/***********************************************************************************************************/

#ifdef USE_AVX512	// For AVX512, support only the fast [i.e. LOACC] Mers-carry macros:

	// 16-fold analog of AVX_cmplx_carry_fast_errcheck_X8 - early tests on KNL showed the prefetching done in this macro
	// makes very little if any difference versus no-prefetch.
	// NOTE - for an earlier version of this macro with easier-to-follow instruction flow see v17, which suffers multiple wait-stalls
	// due to dependent high-latency ops (addpd/mulpd/fma/rndscalepd) following hard upon one another. In the first [A.re] carry
	// section of the code below I've marked such ops with an empty C-comment in the leftmost column:
	#define AVX_cmplx_carry_fast_errcheck_X16(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 8 prefetches-from-main-data-array spread through this macro */\
	/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
	[3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum
	[4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum
	[5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum
	[6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum
	*/\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0-f. Outputs into r0-f: */\
	/* Real parts use zmm0,2,4,6,8,10,12,14,16:				Imag parts use zmm1,3,5,7,9,11,13,15,17: */\
		"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
		"kmovw	%%eax,%%k1			\n\t"\
		/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
		could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
		"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
		"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
			"vmovq		%%rax,%%xmm0 		\n\t"\
			"vmovq		%%rbx,%%xmm1 		\n\t"\
			"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
			"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
	/********************************************************************/\
	/* Second 8-way transpose. Inputs from r10-1f. Outputs into r10-1f: */\
	/********************************************************************/\
		"addq		$0x400,%%rax		\n\t"\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		"subq		$0x400,%%rax			\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%zmm20	\n\t"/* prp_mult, broadcast to all double-slots of zmm20 */\
	/********** RCOL REGISTERS NUMBERED += 16 W.R.TO ANALOGOUS LCOL-REGISTERS **********/\
		"movq	%[__cy],%%rbx				\n\t"\
		"vmovaps	    (%%rbx),%%zmm1		\n\t	vmovaps	0x40(%%rbx),%%zmm17		\n\t"/* zmm1,17 = Our 2 eight-double cy_in[0:7][8:15]-vectors */\
		/* LOACC wts-data occupy 32 zmm-sized slots starting at (vec_dbl*)half_arr + 0 : */\
		"movq	%[__half_arr],%%rdi			\n\t	vmovaps -0x80(%%rdi),%%zmm2		\n\t"/* zmm2 = maxerr */\
		/* In AVX-512 mode, the 4 doubles base[0],baseinv[1],wts_mult[1],inv_mult[0] are in d0-3 slots
		of otherwise-unused sse2_rnd vec_dbl, that is in -0x40(rdi) + 0x[0,8,10,18]:: */\
		"vbroadcastsd -0x40(%%rdi),%%zmm10	\n\t"/* base   [0], PERSISTENT COPY IN zmm10 */\
		"vbroadcastsd -0x38(%%rdi),%%zmm11	\n\t"/* baseinv[1], PERSISTENT COPY IN zmm11 */\
		"vbroadcastsd -0x30(%%rdi),%%zmm12	\n\t"/* wts_mult[1], PERSISTENT COPY IN zmm12 */\
		"vbroadcastsd -0x28(%%rdi),%%zmm13	\n\t	vaddpd %%zmm13,%%zmm13,%%zmm14	\n\t"/* zmm13,14 have inv_mult[0] (needed for conditional-doubling), inv_mult[1] (needed for (wt_re >= inv_mult[1]) comparison) */\
		"movq	%[__sse_n] ,%%rbx			\n\t	vmovaps	(%%rbx),%%zmm15			\n\t"/* PERSISTENT COPY OF SSE_N  REMAINS IN zmm15. */\
		"movq	%[__sse_bw],%%rbx			\n\t"/* RBX HOLDS ADDRESS OF SSE_BW */\
		"movq	%[__bjmod_0],%%rsi			\n\t	vmovaps	(%%rsi),%%zmm3			\n\t"/* bjmod[0:15], PERSISTENT COPY IN zmm3 */\
		"movq	%[__sse_sw] ,%%rsi			\n\t	vmovaps	(%%rsi),%%zmm4			\n\t"/* sw,  16-fold PERSISTENT COPY IN zmm4 */\
		"movq	%[__sign_mask],%%rsi		\n\t	vmovaps %%zmm2,%%zmm18			\n\t"/* Rcol-copy of maxerr, allowing both cols to do independent updates with just one merge at end */\
	/**********************************/\
	/* Do A.re 16-tet: Data in m0,16: */\
	/**********************************/\
	"prefetcht2	(%%r14)					\n\t"\
		"vmovaps	0x000(%%rax),%%zmm0		\n\t	vmovaps	0x400(%%rax),%%zmm16	\n\t"/* Load data */\
		"vmovaps	0x080(%%rdi),%%zmm6		\n\t	vmovaps	0x0c0(%%rdi),%%zmm22	\n\t"/* wi */\
		"vmovaps	0x000(%%rdi),%%zmm5		\n\t	vmovaps	0x040(%%rdi),%%zmm21	\n\t"/* wt for our 16 independent carry-chains */\
/**/	"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"/* x *= wtinv */\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t"/* Set bit in k1 if sw < bjmod[0:15] ; Opmask K1 is analog of AVX-mode bitmask stored in R10 */\
	"kmovw	%[__i],%%k2						\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing K1 (whose */\
	"kxorw	%%k2,%%k1,%%k1					\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"knotw	%%k1,%%k2						\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"/* Upper halves of above-computed 16-bit opmasks, used for rcol operands */\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
/**/	"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"/* temp = DNINT(x) */\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"/* [3] Fwd-base mults */\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"/* [4] Inv-base mults */\
/**/	"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"/* x - temp */\
/**/	"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"/* frac = fabs(x-temp) */\
/**/"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"/* cpy temp */\
/**/	"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"/* temp*baseinv */\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"/* K1,3 = (wt >= inv_mult[1]) [Do compare as (inv_mult[1] < wt)] */\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"/* K2,4 = inverse-masks */\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
/**/	"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"/* [5] [LOACC] wts_mult */\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"/* [6] [LOACC] inv_mult */\
/**/"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"/* x = (temp-cy*base) */\
/**/	"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
/**/	"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x000(%%rdi)	\n\t	vmovaps	%%zmm27,0x040(%%rdi)	\n\t"/* Store wt */\
		"vmovaps	%%zmm6 ,0x080(%%rdi)	\n\t	vmovaps	%%zmm22,0x0c0(%%rdi)	\n\t"/* Store wi */\
		/* x = (temp-cy*base[i])*wt: */\
/**/	"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"/* x *= wt */\
		/* Get ready for next set [IM0~] and store updated weights: */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"/* bjmod[0:15] += bw */\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"/* if(bjmod[0:15] > n) corr. bit in k1 set [NB: cmp is (bjmod > n), the wraparound condition] */\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"/* if(bjmod[0:15] > n) bjmod[0:15] -= n */\
		"vmovaps	%%zmm0 ,     (%%rax)	\n\t	vmovaps	%%zmm16,0x400(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"\n\t"\
	/**********************************/\
	/* Do A.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x040(%%rax),%%zmm0		\n\t	vmovaps	0x440(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x180(%%rdi),%%zmm6		\n\t	vmovaps	0x1c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x100(%%rdi),%%zmm5		\n\t	vmovaps	0x140(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"/* K1 = (sw < bjmod) bigword mask, K2 = inverse; no [__i] 0-word XOR here - that applies only to the first (A.re) set */\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x100(%%rdi)	\n\t	vmovaps	%%zmm27,0x140(%%rdi)	\n\t"/* Store wt_im */\
		"vmovaps	%%zmm6,0x180(%%rdi)		\n\t	vmovaps	%%zmm22,0x1c0(%%rdi)	\n\t"/* Store wi_im */\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x040(%%rax) 	\n\t	vmovaps	%%zmm16,0x440(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"\n\t"\
	/**********************************/\
	/* Do B.re 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x080(%%rax),%%zmm0		\n\t	vmovaps	0x480(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x280(%%rdi),%%zmm6		\n\t	vmovaps	0x2c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x200(%%rdi),%%zmm5		\n\t	vmovaps	0x240(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x200(%%rdi)	\n\t	vmovaps	%%zmm27,0x240(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x280(%%rdi)		\n\t	vmovaps	%%zmm22,0x2c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x080(%%rax) 	\n\t	vmovaps	%%zmm16,0x480(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x0c0(%%rax),%%zmm0		\n\t	vmovaps	0x4c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x380(%%rdi),%%zmm6		\n\t	vmovaps	0x3c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x300(%%rdi),%%zmm5		\n\t	vmovaps	0x340(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x300(%%rdi)	\n\t	vmovaps	%%zmm27,0x340(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x380(%%rdi)		\n\t	vmovaps	%%zmm22,0x3c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x0c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x4c0(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; here r14 (prefetch base) is advanced by p4 doubles before prefetching. */\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht2	(%%r14)					\n\t"\
		"vmovaps	0x100(%%rax),%%zmm0		\n\t	vmovaps	0x500(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x480(%%rdi),%%zmm6		\n\t	vmovaps	0x4c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x400(%%rdi),%%zmm5		\n\t	vmovaps	0x440(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x400(%%rdi)	\n\t	vmovaps	%%zmm27,0x440(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x480(%%rdi)		\n\t	vmovaps	%%zmm22,0x4c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x100(%%rax) 	\n\t	vmovaps	%%zmm16,0x500(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x140(%%rax),%%zmm0		\n\t	vmovaps	0x540(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x580(%%rdi),%%zmm6		\n\t	vmovaps	0x5c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x500(%%rdi),%%zmm5		\n\t	vmovaps	0x540(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x500(%%rdi)	\n\t	vmovaps	%%zmm27,0x540(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x580(%%rdi)		\n\t	vmovaps	%%zmm22,0x5c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x140(%%rax) 	\n\t	vmovaps	%%zmm16,0x540(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x180(%%rax),%%zmm0		\n\t	vmovaps	0x580(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x680(%%rdi),%%zmm6		\n\t	vmovaps	0x6c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x600(%%rdi),%%zmm5		\n\t	vmovaps	0x640(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x600(%%rdi)	\n\t	vmovaps	%%zmm27,0x640(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x680(%%rdi)		\n\t	vmovaps	%%zmm22,0x6c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x180(%%rax) 	\n\t	vmovaps	%%zmm16,0x580(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x1c0(%%rax),%%zmm0		\n\t	vmovaps	0x5c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x780(%%rdi),%%zmm6		\n\t	vmovaps	0x7c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x700(%%rdi),%%zmm5		\n\t	vmovaps	0x740(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x700(%%rdi)	\n\t	vmovaps	%%zmm27,0x740(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x780(%%rdi)		\n\t	vmovaps	%%zmm22,0x7c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x1c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x5c0(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; here r14 (prefetch base) is advanced by p4 doubles before prefetching. */\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht2	(%%r14)					\n\t"\
		"vmovaps	0x200(%%rax),%%zmm0		\n\t	vmovaps	0x600(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x880(%%rdi),%%zmm6		\n\t	vmovaps	0x8c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x800(%%rdi),%%zmm5		\n\t	vmovaps	0x840(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x800(%%rdi)	\n\t	vmovaps	%%zmm27,0x840(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x880(%%rdi)		\n\t	vmovaps	%%zmm22,0x8c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x200(%%rax) 	\n\t	vmovaps	%%zmm16,0x600(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x240(%%rax),%%zmm0		\n\t	vmovaps	0x640(%%rax),%%zmm16	\n\t"\
		"vmovaps	0x980(%%rdi),%%zmm6		\n\t	vmovaps	0x9c0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0x900(%%rdi),%%zmm5		\n\t	vmovaps	0x940(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0x900(%%rdi)	\n\t	vmovaps	%%zmm27,0x940(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0x980(%%rdi)		\n\t	vmovaps	%%zmm22,0x9c0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x240(%%rax) 	\n\t	vmovaps	%%zmm16,0x640(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x280(%%rax),%%zmm0		\n\t	vmovaps	0x680(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xa80(%%rdi),%%zmm6		\n\t	vmovaps	0xac0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xa00(%%rdi),%%zmm5		\n\t	vmovaps	0xa40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xa00(%%rdi)	\n\t	vmovaps	%%zmm27,0xa40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xa80(%%rdi)		\n\t	vmovaps	%%zmm22,0xac0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x280(%%rax) 	\n\t	vmovaps	%%zmm16,0x680(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im 16-tet: Data in m0,16: */\
	/**********************************/\
	/* Same carry/normalize kernel as the A.re 16-tet above; only the data (rax) and weight (rdi) offsets and the prefetch stride differ. */\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x2c0(%%rax),%%zmm0		\n\t	vmovaps	0x6c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xb80(%%rdi),%%zmm6		\n\t	vmovaps	0xbc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xb00(%%rdi),%%zmm5		\n\t	vmovaps	0xb40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xb00(%%rdi)	\n\t	vmovaps	%%zmm27,0xb40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xb80(%%rdi)		\n\t	vmovaps	%%zmm22,0xbc0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x2c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x6c0(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re 16-tet: Data in m0,16: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht2	(%%r14)					\n\t"\
		"vmovaps	0x300(%%rax),%%zmm0		\n\t	vmovaps	0x700(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xc80(%%rdi),%%zmm6		\n\t	vmovaps	0xcc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xc00(%%rdi),%%zmm5		\n\t	vmovaps	0xc40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xc00(%%rdi)	\n\t	vmovaps	%%zmm27,0xc40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xc80(%%rdi)		\n\t	vmovaps	%%zmm22,0xcc0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x300(%%rax) 	\n\t	vmovaps	%%zmm16,0x700(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im 16-tet: Data in m0,16: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x340(%%rax),%%zmm0		\n\t	vmovaps	0x740(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xd80(%%rdi),%%zmm6		\n\t	vmovaps	0xdc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xd00(%%rdi),%%zmm5		\n\t	vmovaps	0xd40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xd00(%%rdi)	\n\t	vmovaps	%%zmm27,0xd40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xd80(%%rdi)		\n\t	vmovaps	%%zmm22,0xdc0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x340(%%rax) 	\n\t	vmovaps	%%zmm16,0x740(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re 16-tet: Data in m0,16: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x380(%%rax),%%zmm0		\n\t	vmovaps	0x780(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xe80(%%rdi),%%zmm6		\n\t	vmovaps	0xec0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xe00(%%rdi),%%zmm5		\n\t	vmovaps	0xe40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xe00(%%rdi)	\n\t	vmovaps	%%zmm27,0xe40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xe80(%%rdi)		\n\t	vmovaps	%%zmm22,0xec0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x380(%%rax) 	\n\t	vmovaps	%%zmm16,0x780(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im 16-tet: Data in m0,16: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht2	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x3c0(%%rax),%%zmm0		\n\t	vmovaps	0x7c0(%%rax),%%zmm16	\n\t"\
		"vmovaps	0xf80(%%rdi),%%zmm6		\n\t	vmovaps	0xfc0(%%rdi),%%zmm22	\n\t"\
		"vmovaps	0xf00(%%rdi),%%zmm5		\n\t	vmovaps	0xf40(%%rdi),%%zmm21	\n\t"\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t	vmulpd	%%zmm22,%%zmm16,%%zmm16	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"kshiftrw	$8,%%k1,%%k3			\n\t	kshiftrw	$8,%%k2,%%k4		\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vmovaps	%%zmm10,%%zmm23			\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vmovaps	%%zmm11,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t	vrndscalepd	$0,%%zmm16,%%zmm25	\n\t"\
	"vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%} \n\t vaddpd %%zmm23,%%zmm23,%%zmm23%{%%k3%}\n\t"\
	"vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k4%}\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t	vsubpd	%%zmm25,%%zmm16,%%zmm16	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t	vpandq	(%%rsi),%%zmm16,%%zmm16	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t vfmadd132pd %%zmm20,%%zmm17,%%zmm25\n\t"\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t	vmaxpd	%%zmm18,%%zmm16,%%zmm18	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t	vmovaps	%%zmm25,%%zmm16			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t	vmulpd	%%zmm24,%%zmm25,%%zmm25	\n\t"\
		/* Start update of weights while waiting for the paired MULPDs to complete: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	vcmppd	$1,%%zmm21,%%zmm14,%%k3	\n\t"\
		"knotw	%%k1,%%k2					\n\t	knotw	%%k3,%%k4				\n\t"\
		"vmovaps	%%zmm12,%%zmm26			\n\t	vmovaps	%%zmm12,%%zmm27			\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vmovaps	%%zmm13,%%zmm24			\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t	vrndscalepd	$0,%%zmm25,%%zmm17	\n\t"/* cy_out */\
	"vaddpd	%%zmm26,%%zmm26,%%zmm26%{%%k2%} \n\t vaddpd %%zmm27,%%zmm27,%%zmm27%{%%k4%}\n\t"\
	"vaddpd	%%zmm8 ,%%zmm8 ,%%zmm8 %{%%k1%} \n\t vaddpd %%zmm24,%%zmm24,%%zmm24%{%%k3%}\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t vfnmadd231pd	%%zmm23,%%zmm17,%%zmm16	\n\t"\
		"vmulpd		%%zmm26,%%zmm5,%%zmm26	\n\t	vmulpd	%%zmm27,%%zmm21,%%zmm27	\n\t"/* wt *= wts_mult[i] */\
		"vmulpd		%%zmm8 ,%%zmm6,%%zmm6	\n\t	vmulpd	%%zmm24,%%zmm22,%%zmm22	\n\t"/* wi *= inv_mult[i] */\
		"vmovaps	%%zmm26,0xf00(%%rdi)	\n\t	vmovaps	%%zmm27,0xf40(%%rdi)	\n\t"\
		"vmovaps	%%zmm6,0xf80(%%rdi)		\n\t	vmovaps	%%zmm22,0xfc0(%%rdi)	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd		%%zmm5 ,%%zmm0,%%zmm0 	\n\t	vmulpd	%%zmm21,%%zmm16,%%zmm16	\n\t"\
		/* Get ready for next set [RE8~] : */\
		"vpaddd		(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vpcmpgtd	%%zmm15,%%zmm3,%%k1		\n\t"\
		"vpsubd	%%zmm15,%%zmm3,%%zmm3%{%%k1%}\n\t"\
		"vmovaps	%%zmm0,0x3c0(%%rax) 	\n\t	vmovaps	%%zmm16,0x7c0(%%rax) 	\n\t"\
		"\n\t"\
		/* Store the bjmodn[0:15] index 16-tet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	vmovaps	%%zmm3,(%%rbx)	\n\t"\
		/* Store our pair of cy_out octets-of-doubles: */\
		"movq	%[__cy],%%rbx				\n\t"\
		"vmovaps		%%zmm1,(%%rbx)		\n\t	vmovaps	%%zmm17,0x40(%%rbx)		\n\t"\
		/* Store maxerr, after merging the separate lcol,rcol maxerr results: */\
		"vmaxpd	%%zmm2,%%zmm18,%%zmm2		\n\t	vmovaps	%%zmm2,-0x80(%%rdi)		\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately): Inputs from r0-f. Outputs into r0-f: */	\
		"movl	$0b00110011,%%eax	\n\t"/* Carry step has overwritten k1, re-init for transpose */\
		"kmovw	%%eax,%%k1			\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Outputs are now ordered - write 'em back to memory: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
	/********************************************************************/\
	/* Second 8-way transpose. Inputs from r10-1f. Outputs into r10-1f: */\
	/********************************************************************/\
		"addq		$0x400,%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Outputs are now ordered - write 'em back to memory: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23","xmm24","xmm25","xmm26","xmm27","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	#define AVX_cmplx_carry_fast_errcheck_X8(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 8 prefetches-from-main-data-array spread through this macro */\
	/* For the AVX-512 sans-table-lookup impl, Here are the needed consts and opmasks.
	[3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum
	[4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum
	[5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum
	[6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum
	*/\
	/* 8-way transpose of inputs (Re, Im parts separately) uses complex-ified version of algo in util.c:test_simd_transpose_8x8(). */\
	/* Inputs from r0-f. Outputs into r0-f: */\
		"movl	$0b00110011,%%eax	\n\t"/* Constant for vblendmpd instructions goes into mask-reg k1 */\
		"kmovw	%%eax,%%k1			\n\t"\
		/* Init vector index-consts needed by vpermt2pd instructions - if regs were at a premium,
		could also init just prior to [3] and use zmm6,7 to hold index-consts: */\
		"movq	$0x0c040e0608000a02,%%rax	\n\t"/* zmm30 = 8+4 0+4 8+6 0+6 8+0 0+0 8+2 0+2 [msw at left] */\
		"movq	$0x0d050f0709010b03,%%rbx	\n\t"/* zmm31 = 8+5 0+5 8+7 0+7 8+1 0+1 8+3 0+3 */\
			"vmovq		%%rax,%%xmm0 		\n\t"\
			"vmovq		%%rbx,%%xmm1 		\n\t"\
			"vpmovzxbq	%%xmm0,%%zmm30		\n\t"\
			"vpmovzxbq	%%xmm1,%%zmm31		\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Write original columns back as rows: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%zmm20	\n\t"/* prp_mult, broadcast to all double-slots of zmm20 */\
		"movq		%[__cy],%%rbx			\n\t	vmovaps	     (%%rbx),%%zmm1	\n\t"/* zmm1 = Our eight-double cy_in */\
		/* LOACC wts-data occupy 32 zmm-sized slots starting at (vec_dbl*)half_arr + 0 : */\
		"movq	%[__half_arr],%%rdi			\n\t	vmovaps -0x80(%%rdi),%%zmm2	\n\t"/* zmm2 = maxerr */\
		/* In AVX-512 mode, the 4 doubles base[0],baseinv[1],wts_mult[1],inv_mult[0] are in d0-3 slots
		of otherwise-unused sse2_rnd vec_dbl, that is in -0x40(rdi) + 0x[0,8,10,18]:: */\
		"vbroadcastsd -0x40(%%rdi),%%zmm10	\n\t"\
		"vbroadcastsd -0x38(%%rdi),%%zmm11	\n\t"\
		"vbroadcastsd -0x30(%%rdi),%%zmm12	\n\t"\
		"vbroadcastsd -0x28(%%rdi),%%zmm13	\n\t	vaddpd %%zmm13,%%zmm13,%%zmm14	\n\t"/* ymm13,14 have inv_mult[0] (needed for conditional-doubling), inv_mult[1] (needed for (wt_re >= inv_mult[1]) comparison) */\
		"movq	%[__sse_n] ,%%rbx			\n\t	vmovaps	(%%rbx),%%ymm15			\n\t"/* PERSISTENT COPY OF SSE_N  REMAINS IN ymm15. */\
		"movq	%[__sse_bw],%%rbx			\n\t"/* RBX HOLDS ADDRESS OF SSE_BW */\
		"movq	%[__bjmod_0],%%rsi			\n\t	vmovaps	(%%rsi),%%ymm3			\n\t"/* bjmod[0:7], PERSISTENT COPY IN ymm3 */\
		"movq	%[__sse_sw] ,%%rsi			\n\t	vmovaps	(%%rsi),%%ymm4			\n\t"/* sw,  8-fold PERSISTENT COPY IN ymm4 */\
		"movq	%[__sign_mask],%%rsi		\n\t"\
	/**********************************/\
	/* Do A.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
		"vmovaps	0x000(%%rax),%%zmm0		\n\t"\
		"vmovaps	0x080(%%rdi),%%zmm6		\n\t"/* wi_re - 8-way carry macro only uses every other one of these local-mem-slots */\
		"vmovaps	0x000(%%rdi),%%zmm5		\n\t"/* wt_re for our 8 independent carry-chains */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"/* x *= wtinv */\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t"/* Set bit in k1 if sw < bjmod[0:7] ; Opmask K1 is analog of AVX-mode bitmask stored in R10 */\
	"kmovw	%[__i],%%k2						\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing K1 (whose */\
	"kxorw	%%k2,%%k1,%%k1					\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"knotw	%%k1,%%k2						\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"/* [3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum: */\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"/* [4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum: */\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"/* x - temp */\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"/* cpy temp */\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"/* temp*baseinv */\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"/* x = (temp-cy*base) */\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"/* K1 = (wt_re >= inv_mult[1]) [Do compare as (inv_mult[1] < wt_re)]; K2 = inverse-mask */\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"/* [5] [LOACC] Init = wts_mult[1] x 8, anytime AVX-style lookup into 5th mini-table would have bit = 0, double the corr. datum: */\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"/* [6] [LOACC] Init = inv_mult[0] x 8, anytime AVX-style lookup into 6th mini-table would have bit = 1, double the corr. datum: */\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"/* x *= wt */\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"/* wi_re *= inv_mult[i] */\
		/* Get ready for next set [IM0~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"/* bjmod[0:7] += bw */\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"/* if(n > bjmod[0:7]) ymm7 = 11...11 */\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"/* if(n > bjmod[0:7]) ymm7 = n; otherwise 0 */\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"/* if(n > bjmod[0:7]) bjmod[0:7] -= n */\
		"vmovaps	%%zmm0,     (%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x000(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%zmm6,0x080(%%rdi)		\n\t"/* Store wi_re */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x040(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x180(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x100(%%rdi),%%zmm5		\n\t"/* wt_im for our 8 independent carry-chains */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"/* wi_im *= inv_mult[i] */\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x040(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"vmovaps	%%zmm5,0x100(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%zmm6,0x180(%%rdi)		\n\t"/* Store wi_im */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x080(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x280(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x200(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x080(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x200(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x280(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x0c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x380(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x300(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x0c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x300(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x380(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x100(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x480(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x400(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x100(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x400(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x480(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x140(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x580(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x500(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x140(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x500(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x580(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x180(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x680(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x600(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x180(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x600(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x680(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x1c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x780(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x700(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x1c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x700(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x780(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
		"vmovaps	0x200(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x880(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0x800(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM4~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x200(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x800(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x880(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do E.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x240(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0x980(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0x900(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE5~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x240(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0x900(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0x980(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x280(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xa80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xa00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM5~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x280(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xa00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xa80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do F.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x2c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xb80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xb00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE6~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x2c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xb00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xb80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x300(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xc80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xc00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM6~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x300(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xc00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xc80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do G.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x340(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xd80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xd00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE7~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x340(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xd00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xd80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x380(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xe80(%%rdi),%%zmm6		\n\t"/* wi_re */\
		"vmovaps	0xe00(%%rdi),%%zmm5		\n\t"/* wt_re */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [IM7~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x380(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xe00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xe80(%%rdi)		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do H.im-octet: Data in zmm0 :  */\
	/**********************************/\
		"vmovaps	0x3c0(%%rax),%%zmm0 	\n\t"\
		"vmovaps	0xf80(%%rdi),%%zmm6		\n\t"/* wi_im */\
		"vmovaps	0xf00(%%rdi),%%zmm5		\n\t"/* wt_im */\
		"vmulpd		%%zmm6,%%zmm0,%%zmm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vrndscalepd	$0,%%zmm0,%%zmm9	\n\t"\
		"vsubpd		%%zmm9,%%zmm0,%%zmm0	\n\t"\
		"vpandq		(%%rsi),%%zmm0,%%zmm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%zmm9,%%zmm0			\n\t"\
		"vmulpd		%%zmm8,%%zmm9,%%zmm9	\n\t"\
		"vrndscalepd	$0,%%zmm9,%%zmm1	\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%zmm7,%%zmm1,%%zmm0	\n\t"\
		/* Update and store weights: */\
		"vcmppd	$1,%%zmm5,%%zmm14,%%k1		\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm12,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k2%}	\n\t"\
		"vmovaps	%%zmm13,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k1%}	\n\t"\
		"vmulpd		%%zmm5,%%zmm0,%%zmm0 	\n\t"\
		"vmulpd		%%zmm7,%%zmm5,%%zmm5	\n\t"\
		"vmulpd		%%zmm8,%%zmm6,%%zmm6	\n\t"\
		/* Get ready for next set [RE8~] : */\
		"vpaddd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vpcmpgtd	%%ymm15,%%ymm3,%%ymm7	\n\t"\
		"vpand		%%ymm15,%%ymm7,%%ymm7	\n\t"\
		"vpsubd		%%ymm7 ,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%zmm0,0x3c0(%%rax) 	\n\t"\
		"vmovaps	%%zmm5,0xf00(%%rdi)		\n\t"\
		"vmovaps	%%zmm6,0xf80(%%rdi)		\n\t"\
		"\n\t"\
		/* Store the bjmodn[0:7] index octet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	vmovaps	%%ymm3,(%%rbx)	\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy] ,%%rbx			\n\t	vmovaps	%%zmm1,(%%rbx)	\n\t"\
		/* Store maxerr: */\
		"vmovaps	%%zmm2,-0x80(%%rdi)		\n\t"\
	/* 8-way transpose of outputs (Re, Im parts separately): Inputs from r0-f. Outputs into r0-f: */	\
		"movl	$0b00110011,%%eax	\n\t"/* Carry step has overwritten k1, re-init for transpose */\
		"kmovw	%%eax,%%k1			\n\t"\
		"movq		%[__data],%%rax		\n\t"\
		/* Read in the 8 rows of our input matrix: */\
		"vmovaps		0x000(%%rax),%%zmm0					\n\t	vmovaps		0x040(%%rax),%%zmm12		\n\t"\
		"vmovaps		0x080(%%rax),%%zmm1					\n\t	vmovaps		0x0c0(%%rax),%%zmm13		\n\t"\
		"vmovaps		0x100(%%rax),%%zmm2					\n\t	vmovaps		0x140(%%rax),%%zmm14		\n\t"\
		"vmovaps		0x180(%%rax),%%zmm3					\n\t	vmovaps		0x1c0(%%rax),%%zmm15		\n\t"\
		"vmovaps		0x200(%%rax),%%zmm4					\n\t	vmovaps		0x240(%%rax),%%zmm16		\n\t"\
		"vmovaps		0x280(%%rax),%%zmm5					\n\t	vmovaps		0x2c0(%%rax),%%zmm17		\n\t"\
		"vmovaps		0x300(%%rax),%%zmm6					\n\t	vmovaps		0x340(%%rax),%%zmm18		\n\t"\
		"vmovaps		0x380(%%rax),%%zmm7					\n\t	vmovaps		0x3c0(%%rax),%%zmm19		\n\t"\
		/* [1] Shuffle the 4-aparts - note the different patterning of the first and second output quartet: */\
		"vshuff64x2	$0b01000100, %%zmm4,	%%zmm0,	%%zmm8 	\n\t	vshuff64x2	$0b01000100, %%zmm16,%%zmm12,	%%zmm20	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm4,	%%zmm0,	%%zmm4 	\n\t	vshuff64x2	$0b11101110, %%zmm16,%%zmm12,	%%zmm16	\n\t"\
		"vshuff64x2	$0b01000100, %%zmm5,	%%zmm1,	%%zmm9	\n\t	vshuff64x2	$0b01000100, %%zmm17,%%zmm13,	%%zmm21	\n\t"\
		"vshuff64x2	$0b11101110, %%zmm5,	%%zmm1,	%%zmm5	\n\t	vshuff64x2	$0b11101110, %%zmm17,%%zmm13,	%%zmm17	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm6,	%%zmm2,	%%zmm10	\n\t	vshuff64x2	$0b00010001, %%zmm18,%%zmm14,	%%zmm22	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm6,	%%zmm2,	%%zmm6	\n\t	vshuff64x2	$0b10111011, %%zmm18,%%zmm14,	%%zmm18	\n\t"\
		"vshuff64x2	$0b00010001, %%zmm7,	%%zmm3,	%%zmm11	\n\t	vshuff64x2	$0b00010001, %%zmm19,%%zmm15,	%%zmm23	\n\t"\
		"vshuff64x2	$0b10111011, %%zmm7,	%%zmm3,	%%zmm7	\n\t	vshuff64x2	$0b10111011, %%zmm19,%%zmm15,	%%zmm19	\n\t"\
		/* [2] Blend in the 2-aparts */\
		"vblendmpd	%%zmm8 ,	%%zmm10,	%%zmm0%{%%k1%}	\n\t	vblendmpd	%%zmm20,	%%zmm22,	%%zmm12%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm10,	%%zmm8 ,	%%zmm8%{%%k1%}	\n\t	vblendmpd	%%zmm22,	%%zmm20,	%%zmm20%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm4 ,	%%zmm6 ,	%%zmm1%{%%k1%}	\n\t	vblendmpd	%%zmm16,	%%zmm18,	%%zmm13%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm6 ,	%%zmm4 ,	%%zmm4%{%%k1%}	\n\t	vblendmpd	%%zmm18,	%%zmm16,	%%zmm16%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm9 ,	%%zmm11,	%%zmm2%{%%k1%}	\n\t	vblendmpd	%%zmm21,	%%zmm23,	%%zmm14%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm11,	%%zmm9 ,	%%zmm9%{%%k1%}	\n\t	vblendmpd	%%zmm23,	%%zmm21,	%%zmm21%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm5 ,	%%zmm7 ,	%%zmm3%{%%k1%}	\n\t	vblendmpd	%%zmm17,	%%zmm19,	%%zmm15%{%%k1%}	\n\t"\
		"vblendmpd	%%zmm7 ,	%%zmm5 ,	%%zmm5%{%%k1%}	\n\t	vblendmpd	%%zmm19,	%%zmm17,	%%zmm17%{%%k1%}	\n\t"\
		/* [3] Shuffle or permute in the 1-aparts */\
		"vshufpd	$0b00000000,%%zmm2,		%%zmm0,%%zmm10 	\n\t	vshufpd	$0b00000000,%%zmm14,	%%zmm12,%%zmm22	\n\t"\
		"vshufpd	$0b11111111,%%zmm2,		%%zmm0,%%zmm11 	\n\t	vshufpd	$0b11111111,%%zmm14,	%%zmm12,%%zmm23	\n\t"\
		"vmovapd	%%zmm8,%%zmm2							\n\t	vmovapd	%%zmm20,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm30,%%zmm2 	\n\t	vpermt2pd				%%zmm21,	%%zmm30,%%zmm14	\n\t"\
		"vpermt2pd				%%zmm9,		%%zmm31,%%zmm8	\n\t	vpermt2pd				%%zmm21,	%%zmm31,%%zmm20	\n\t"\
		"vshufpd	$0b00000000,%%zmm3,		%%zmm1,%%zmm0 	\n\t	vshufpd	$0b00000000,%%zmm15,	%%zmm13,%%zmm12	\n\t"\
		"vshufpd	$0b11111111,%%zmm3,		%%zmm1,%%zmm1 	\n\t	vshufpd	$0b11111111,%%zmm15,	%%zmm13,%%zmm13	\n\t"\
		"vmovapd	%%zmm4,%%zmm3							\n\t	vmovapd	%%zmm16,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm30,%%zmm3 	\n\t	vpermt2pd				%%zmm17,	%%zmm30,%%zmm15	\n\t"\
		"vpermt2pd				%%zmm5,		%%zmm31,%%zmm4	\n\t	vpermt2pd				%%zmm17,	%%zmm31,%%zmm16	\n\t"\
		/* Outputs are now ordered - write 'em back to memory: */\
		"vmovaps		%%zmm10,0x000(%%rax)				\n\t	vmovaps		%%zmm22,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm11,0x080(%%rax)				\n\t	vmovaps		%%zmm23,0x0c0(%%rax)		\n\t"\
		"vmovaps		%%zmm2 ,0x100(%%rax)				\n\t	vmovaps		%%zmm14,0x140(%%rax)		\n\t"\
		"vmovaps		%%zmm8 ,0x180(%%rax)				\n\t	vmovaps		%%zmm20,0x1c0(%%rax)		\n\t"\
		"vmovaps		%%zmm0 ,0x200(%%rax)				\n\t	vmovaps		%%zmm12,0x240(%%rax)		\n\t"\
		"vmovaps		%%zmm1 ,0x280(%%rax)				\n\t	vmovaps		%%zmm13,0x2c0(%%rax)		\n\t"\
		"vmovaps		%%zmm3 ,0x300(%%rax)				\n\t	vmovaps		%%zmm15,0x340(%%rax)		\n\t"\
		"vmovaps		%%zmm4, 0x380(%%rax)				\n\t	vmovaps		%%zmm16,0x3c0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm16","xmm17","xmm18","xmm19","xmm20","xmm21","xmm22","xmm23","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	// For carry-step radices == 4 (mod 8) need an AVX-512 version of the AVX_cmplx_carry_fast_errcheck_X4 macro,
	// designed to be called twice sequentially, the first time processing the j+[0,2,4,6] data, the 2nd time the j+[8,10,12,14].
	// Call 1 feeds the d0-3 data of the re.d0-7,im.d0-7 vector-doubles to the 4x4 transposes; Call 2 uses the d4-7 data.
	// Notes:
	// [1] Since this avx-512 version only used for final-4-sets-of-carries cleanup, no need for avx version's __i argument.
	// [2] Call 2 of this macro must have main-data pointers += 0x20 w.r.to Call 1, and weights-data pointers += 0x400.
	// [3] Carry and bjmod-quartets pointers are same for Call 1 and Call 2, since Call 2 propagates the same CY-quartet through the j+[8,10,12,14] data.
	// [4] Call 2 has weights-data pointers += 0x400; this byte offset is specified via the [doff] argument.
	// [5] Don't need to update/write the wts-data due to the final-cleanup-pass nature of this 4-way carry macro.
	#define AVX_cmplx_carry_fast_errcheck_X4(Xdata,Xcy,Xbjmod_0, Xhalf_arr,Xdoff, Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"vbroadcastsd	(%%rax),%%zmm20	\n\t"/* In order ot make use of vec-reg w/index > 15, MUST USE zmm20, I.E. IN FULL-WIDTH MODE */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	     (%%rax),%%xmm0				\n\t	vmovaps	0x040(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x080(%%rax),%%xmm8				\n\t	vmovaps	0x0c0(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x100(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0x140(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0x180(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0x1c0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	/* Next set of byte offsets are += 0x10 w.r.to the above ones, same as in AVX/AVX2 version: */\
	"vmovaps	0x010(%%rax),%%xmm4				\n\t	vmovaps	0x050(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x090(%%rax),%%xmm8				\n\t	vmovaps	0x0d0(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x110(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0x150(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0x190(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0x1d0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm1-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,     (%%rax)		\n\t	vmovaps	%%ymm1 ,0x040(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x080(%%rax)		\n\t	vmovaps	%%ymm3 ,0x0c0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x100(%%rax)		\n\t	vmovaps	%%ymm5 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0x180(%%rax)		\n\t	vmovaps	%%ymm7 ,0x1c0(%%rax)		\n\t"\
	/*** The actual carry stuff bookended by the 4x4 transposes based on the avx-512 8-way carry, but with reg-data half-sized as
	needed. That means more or less everything but the opmask stuff, which in AVX-512F requires full-width 512-bit registers: ***/\
		"movq		%[__cy],%%rbx			\n\t	vmovaps	     (%%rbx),%%ymm1	\n\t"/* ymm1 = Our four-double cy_in */\
		"movq	%[__half_arr],%%rdi			\n\t	vmovaps -0x80(%%rdi),%%ymm2	\n\t"/* ymm2 = maxerr */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 0. Need to include the Call 1|2 byte offset here : */\
		"leaq	%c[__doff](%%rdi),%%rdx		\n\t"\
		/* In AVX-512 mode, the 4 doubles base[0],baseinv[1],wts_mult[1],inv_mult[0] are in d0-3 slots
		of otherwise-unused sse2_rnd vec_dbl, that is in -0x40(rdi) + 0x[0,8,10,18]: */\
		"vbroadcastsd -0x40(%%rdi),%%ymm10	\n\t"\
		"vbroadcastsd -0x38(%%rdi),%%ymm11	\n\t"\
		"vbroadcastsd -0x30(%%rdi),%%ymm12	\n\t"\
		"vbroadcastsd -0x28(%%rdi),%%ymm13	\n\t	vaddpd %%ymm13,%%ymm13,%%ymm14	\n\t"/* ymm13,14 have inv_mult[0] (needed for conditional-doubling), inv_mult[1] (needed for (wt_re >= inv_mult[1]) comparison) */\
		"movq	%[__sse_n] ,%%rbx			\n\t	vmovaps	(%%rbx),%%xmm15			\n\t"/* PERSISTENT COPY OF SSE_N  REMAINS IN xmm15. */\
		"movq	%[__sse_bw],%%rbx			\n\t"/* RBX HOLDS ADDRESS OF SSE_BW */\
		"movq	%[__bjmod_0],%%rsi			\n\t	vmovaps	(%%rsi),%%xmm3			\n\t"/* bjmod[0:3], PERSISTENT COPY IN xmm3 */\
		"movq	%[__sse_sw] ,%%rsi			\n\t	vmovaps	(%%rsi),%%xmm4			\n\t"/* sw,  8-fold PERSISTENT COPY IN xmm4 */\
		"movq	%[__sign_mask],%%rsi		\n\t"\
	/**********************************/\
	/* Do A.re-octet: Data in zmm0 :  */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
		"vmovaps	0x000(%%rax),%%ymm0		\n\t"\
		"vmovaps	0x080(%%rdx),%%ymm6		\n\t"/* wi_re */\
		"vmovaps	0x000(%%rdx),%%ymm5		\n\t"/* wt_re for our 8 independent carry-chains */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t"/* Set bit in k1 if sw < bjmod[0:3] ; Opmask K1 is analog of AVX-mode bitmask stored in R10 */\
	"knotw	%%k1,%%k2						\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"/* [3] Fwd-base mults: Init = base[0] x 8, anytime AVX-style lookup into 3rd mini-table would have bit = 1, double the corr. datum: */\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"/* [4] Inv-base mults: Init = binv[1] x 8, anytime AVX-style lookup into 4th mini-table would have bit = 0, double the corr. datum: */\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) ... PANDQ requires full-width reg in AVX-512F so revert to ANDPD here. */\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy *//* Again, MUST USE MM20 IN FULL-WIDTH MODE */\
	/* vmaxpd: AVX-512F requires full-width zmm-regs here, but only use lower 256 bits: */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"/* cpy temp */\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"/* x *= wt */\
		/* Get ready for next set [IM0~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"/* bjmod[0:3] += bw */\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"/* if(n > bjmod[0:3]) xmm7 = 11...11 */\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"/* if(n > bjmod[0:3]) xmm7 = n; otherwise 0 */\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"vmovaps	%%ymm0,     (%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0 :  */\
	/**********************************/\
		"vmovaps	0x040(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x180(%%rdx),%%ymm6		\n\t"/* wi_im */\
		"vmovaps	0x100(%%rdx),%%ymm5		\n\t"/* wt_im for our 8 independent carry-chains */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x040(%%rax) 	\n\t"/* Store normalized, fwd-weighted datum */\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0 :  */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x080(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x280(%%rdx),%%ymm6		\n\t"/* wi_re */\
		"vmovaps	0x200(%%rdx),%%ymm5		\n\t"/* wt_re */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x080(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0 :  */\
	/**********************************/\
		"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x380(%%rdx),%%ymm6		\n\t"/* wi_im */\
		"vmovaps	0x300(%%rdx),%%ymm5		\n\t"/* wt_im */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [RE2~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x0c0(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0 :  */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x100(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x480(%%rdx),%%ymm6		\n\t"/* wi_re */\
		"vmovaps	0x400(%%rdx),%%ymm5		\n\t"/* wt_re */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [IM2~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x100(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0 :  */\
	/**********************************/\
		"vmovaps	0x140(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x580(%%rdx),%%ymm6		\n\t"/* wi_im */\
		"vmovaps	0x500(%%rdx),%%ymm5		\n\t"/* wt_im */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [RE3~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x140(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0 :  */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
		"vmovaps	0x180(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x680(%%rdx),%%ymm6		\n\t"/* wi_re */\
		"vmovaps	0x600(%%rdx),%%ymm5		\n\t"/* wt_re */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [IM3~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x180(%%rax) 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0 :  */\
	/**********************************/\
		"vmovaps	0x1c0(%%rax),%%ymm0 	\n\t"\
		"vmovaps	0x780(%%rdx),%%ymm6		\n\t"/* wi_im */\
		"vmovaps	0x700(%%rdx),%%ymm5		\n\t"/* wt_im */\
		"vmulpd		%%ymm6,%%ymm0,%%ymm0	\n\t"\
	"vpcmpd	$1,%%zmm3,%%zmm4,%%k1			\n\t	knotw	%%k1,%%k2	\n\t"\
		"vmovaps	%%zmm10,%%zmm7			\n\t	vaddpd	%%zmm7,%%zmm7,%%zmm7%{%%k1%}	\n\t"\
		"vmovaps	%%zmm11,%%zmm8			\n\t	vaddpd	%%zmm8,%%zmm8,%%zmm8%{%%k2%}	\n\t"\
		"vroundpd	$0,%%ymm0,%%ymm9		\n\t"\
		"vsubpd		%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vandpd		(%%rsi),%%ymm0,%%ymm0	\n\t"\
	"vfmadd132pd	%%zmm20,%%zmm1,%%zmm9	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%zmm2,%%zmm0,%%zmm2	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"vmovaps	%%ymm9,%%ymm0			\n\t"\
		"vmulpd		%%ymm8,%%ymm9,%%ymm9	\n\t"\
		"vroundpd	$0,%%ymm9,%%ymm1		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd	%%ymm7,%%ymm1,%%ymm0	\n\t"\
		"vmulpd		%%ymm5,%%ymm0,%%ymm0 	\n\t"\
		/* Get ready for next set [RE4~] : */\
		"vpaddd		(%%rbx),%%xmm3,%%xmm3	\n\t"\
		"vpcmpgtd	%%xmm15,%%xmm3,%%xmm7	\n\t"\
		"vpand		%%xmm15,%%xmm7,%%xmm7	\n\t"\
		"vpsubd		%%xmm7 ,%%xmm3,%%xmm3	\n\t"\
		"vmovaps	%%ymm0,0x1c0(%%rax) 	\n\t"\
		"\n\t"\
		/* Store the bjmodn[0:3] index octet: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	vmovaps	%%xmm3,(%%rbx)	\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy] ,%%rbx			\n\t	vmovaps	%%ymm1,(%%rbx)	\n\t"\
		/* Store maxerr: */\
		"vmovaps	%%ymm2,-0x80(%%rdi)		\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	     (%%rax),%%xmm0				\n\t	vmovaps	0x040(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x080(%%rax),%%xmm8				\n\t	vmovaps	0x0c0(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x100(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0x140(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0x180(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0x1c0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x010(%%rax),%%xmm4				\n\t	vmovaps	0x050(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x090(%%rax),%%xmm8				\n\t	vmovaps	0x0d0(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x110(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0x150(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0x190(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0x1d0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,     (%%rax)		\n\t	vmovaps	%%ymm1 ,0x040(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x080(%%rax)		\n\t	vmovaps	%%ymm3 ,0x0c0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0x180(%%rax)		\n\t	vmovaps	%%ymm7 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x100(%%rax)		\n\t	vmovaps	%%ymm5 ,0x140(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Literal-byte offset for weights data - can't simply add this to the input half_arr ptr since need unmodified value of that for aux-consts */\
		, [__doff]		"e" (Xdoff)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm20"	/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX2)	// FMA-using versions of the 8-way and 4-way macros def'd for AVX:

	// 8-way version of the AVX 4-way carry macro,
	// analogous to the 128-bit-setting 4-way SSE2_cmplx_carry_fast_errcheck macro:
	#define AVX_cmplx_carry_fast_errcheck_X8(Xdata,XcyA,XcyB,Xbjmod_0,Xbjmod_4,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		/* And un-fiddle the base address: */\
		"subq	$0x100,%%rax				\n\t"\
	/*** mm6-9 *FREE* between here and closing un-transpose block ... each processing ***/\
	/*** column below uses 5 vector registers, making it tempting to add a 3rd column ***/\
	/*** In this version of the carry macro use 1 of the free vec-regs for bjmod[4:7] ***/\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	(%%rbx),%%ymm12			\n\t	vmovaps	(%%rcx),%%ymm13	\n\t"/* ymm12,13 = Our pair of four-double cy_ins */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi			\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14		\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"vmovaps	     (%%rax),%%ymm0 	\n\t	vmovaps	0x100(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__bjmod_0],%%rsi			\n\t	movq	%[__bjmod_4],%%rcx	\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t	vmovaps	(%%rcx),%%xmm6		\n\t"/* bjmod[0:3] and [4:7], persistent copies in xmm15,xmm6, resp. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"movslq	%[__i]	,%%rbx				\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%r10				\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	/* lcol = <0:3> << 5, rcol = <4:7> << 5 (shift = 5 to give ptr offset for ymm-size data: */\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t	vmovaps	0xc60(%%rdi),%%ymm5 		\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t	vmovaps	0xc20(%%rdi),%%ymm3 		\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"/* cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"/* Extract cmp-results into 8-bit signmask */\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,     (%%rax)	\n\t	vmovaps	%%ymm1 ,0x100(%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xc20(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xc60(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"/* bjmod[0:7] += bw  */\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"/* if(n > bjmod[0:7]) xmm1 = 11...11 */\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"/* if(n > bjmod[0:7]) bjmod[0:7] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x020(%%rax),%%ymm0 	\n\t	vmovaps	0x120(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xce0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t	vmovaps	0xca0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x020(%%rax)	\n\t	vmovaps	%%ymm1 ,0x120(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xca0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xce0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x040(%%rax),%%ymm0 	\n\t	vmovaps	0x140(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t	vmovaps	0xd60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t	vmovaps	0xd20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x040(%%rax)	\n\t	vmovaps	%%ymm1 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xd20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xd60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x060(%%rax),%%ymm0 	\n\t	vmovaps	0x160(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xde0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t	vmovaps	0xda0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x060(%%rax)	\n\t	vmovaps	%%ymm1 ,0x160(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xda0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xde0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
	"vmovaps	0x080(%%rax),%%ymm0 	\n\t	vmovaps	0x180(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t	vmovaps	0xe60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t	vmovaps	0xe20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)	\n\t	vmovaps	%%ymm1 ,0x180(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xe20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xe60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0a0(%%rax),%%ymm0 	\n\t	vmovaps	0x1a0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t	vmovaps	0xee0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t	vmovaps	0xea0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0a0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1a0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xea0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xee0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t	vmovaps	0x1c0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t	vmovaps	0xf60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t	vmovaps	0xf20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0c0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xf20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xf60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm0 	\n\t	vmovaps	0x1e0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xfe0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t	vmovaps	0xfa0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t	vfmadd132pd	%%ymm8,%%ymm13,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm12\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm13	\n\t"\
		"vroundpd	$0,%%ymm12,%%ymm12		\n\t	vroundpd	$0,%%ymm13,%%ymm13		\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
	"vfnmadd231pd 0x400(%%r10),%%ymm12,%%ymm0 \n\t vfnmadd231pd	0x400(%%rsi),%%ymm13,%%ymm1	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0e0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1e0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xfa0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xfe0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
		/* Store the two bjmodn index quartets: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	movq	%[__bjmod_4],%%rcx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t	vmovaps	%%xmm6 ,(%%rcx)			\n\t"\
		/* Store cy_outs: */\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)			\n\t	vmovaps	%%ymm13,(%%rcx)	\n\t"/* ymm12,13 = Our pair of four-double cy_outs */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5,6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__bjmod_4]	"m" (Xbjmod_4)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

	// Register-name choices in the non-transpose middle section reflect preparation for an 8-way version of this macro:
	#define AVX_cmplx_carry_fast_errcheck_X4(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5,6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm1-7 back to memory to free up vector registers: */\
		"												vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		"movq		%[__cy],%%rbx		\n\t"\
		"vmovaps	(%%rbx),%%ymm12		\n\t"/* ymm12 = cy_in */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14	\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"movq	%[__bjmod_0],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm15. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm10			\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rbx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		/* This 4-way vector-carry macro uses only the even-indexed 16 of said slots, having byte offsets == 0 (mod 0x40) */\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] += bw  */\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x20(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi		\n\t"\
	"vmovaps	(%%rsi),	%%xmm10		\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x20(%%rax)		\n\t"/* Store A.im to free up a register */\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x40(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x40(%%rax)		\n\t"/* Store B.re to free up a register */\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x60(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x60(%%rax)		\n\t"/* Store B.im to free up a register */\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE2~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x80(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t"/* Store C.re to free up a register */\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM2~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xa0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xa0(%%rax)		\n\t"/* Store C.im to free up a register */\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE3~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xc0(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xc0(%%rax)		\n\t"/* Store D.re to free up a register */\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM3~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xe0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data) */\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
	"vfmadd132pd	%%ymm8,%%ymm12,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
	"vfnmadd231pd	0x400(%%rsi),%%ymm12,%%ymm0 \n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xe0(%%rax)		\n\t"/* Store D.im to free up a register */\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rbx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)	\n\t"/* cy_out = ymm12 */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm12","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX)

	// 8-way version of the AVX 4-way carry macro,
	// analogous to the 128-bit-setting 4-way SSE2_cmplx_carry_fast_errcheck macro:
	#define AVX_cmplx_carry_fast_errcheck_X8(Xdata,XcyA,XcyB,Xbjmod_0,Xbjmod_4,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xp4, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		/* And un-fiddle the base address: */\
		"subq	$0x100,%%rax				\n\t"\
	/*** mm6-9 *FREE* between here and closing un-transpose block ... each processing ***/\
	/*** column below uses 5 vector registers, making it tempting to add a 3rd column ***/\
	/*** In this version of the carry macro use 1 of the free vec-regs for bjmod[4:7] ***/\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	(%%rbx),%%ymm12			\n\t	vmovaps	(%%rcx),%%ymm13	\n\t"/* ymm12,13 = Our pair of four-double cy_ins */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi			\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14		\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"vmovaps	     (%%rax),%%ymm0 	\n\t	vmovaps	0x100(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__bjmod_0],%%rsi			\n\t	movq	%[__bjmod_4],%%rcx	\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t	vmovaps	(%%rcx),%%xmm6		\n\t"/* bjmod[0:3] and [4:7], persistent copies in xmm15,xmm6, resp. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"movslq	%[__i]	,%%rbx				\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%r10				\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	/* byte offsets for bits <0:3> and <4:7> go into r10, rsi, respectively. */\
	/* lcol = <0:3> << 5, rcol = <4:7> << 5 (shift = 5 to give ptr offset for ymm-size data: */\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t	vmovaps	0xc60(%%rdi),%%ymm5 		\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t	vmovaps	0xc20(%%rdi),%%ymm3 		\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"/* temp = DNINT(x) */\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"/* temp*baseinv */\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"/* cy_out */\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"/* cy*base */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"/* x = (temp-cy*base) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"/* Extract cmp-results into two 4-bit signmasks */\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,     (%%rax)	\n\t	vmovaps	%%ymm1 ,0x100(%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xc20(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xc60(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"/* bjmod[0:7] += bw  */\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"/* if(n > bjmod[0:7]) xmm1 = 11...11 */\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"/* if(n > bjmod[0:7]) bjmod[0:7] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x020(%%rax),%%ymm0 	\n\t	vmovaps	0x120(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"/* sw (two 4-fold copies) */\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"/* sw - bjmod[0:3],[4:7] */\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xce0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t	vmovaps	0xca0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x020(%%rax)	\n\t	vmovaps	%%ymm1 ,0x120(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xca0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xce0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x040(%%rax),%%ymm0 	\n\t	vmovaps	0x140(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t	vmovaps	0xd60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t	vmovaps	0xd20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x040(%%rax)	\n\t	vmovaps	%%ymm1 ,0x140(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xd20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xd60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x060(%%rax),%%ymm0 	\n\t	vmovaps	0x160(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xde0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t	vmovaps	0xda0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x060(%%rax)	\n\t	vmovaps	%%ymm1 ,0x160(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xda0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xde0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p4],%%r15			\n\t"\
	"leaq		(%%r14,%%r15,8),%%r14	\n\t"\
	"prefetcht0	(%%r14)					\n\t"\
	"vmovaps	0x080(%%rax),%%ymm0 	\n\t	vmovaps	0x180(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t	vmovaps	0xe60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t	vmovaps	0xe20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)	\n\t	vmovaps	%%ymm1 ,0x180(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xe20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xe60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p1],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0a0(%%rax),%%ymm0 	\n\t	vmovaps	0x1a0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t	vmovaps	0xee0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t	vmovaps	0xea0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0a0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1a0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xea0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xee0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p2],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0c0(%%rax),%%ymm0 	\n\t	vmovaps	0x1c0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t	vmovaps	0xf60(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t	vmovaps	0xf20(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* x = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0c0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1c0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xf20(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xf60(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-octet: Data in ymm0-1: */\
	/**********************************/\
	"movslq		%[__p3],%%r15			\n\t"\
	"prefetcht0	(%%r14,%%r15,8)			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm0 	\n\t	vmovaps	0x1e0(%%rax),%%ymm1 \n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	/* Extract sign bits into pair of 4-bit signmasks in r10 and rsi, idxs into base/inv table */\
	"vmovaps	(%%rsi),%%xmm10			\n\t	vmovaps	%%xmm10,%%xmm11		\n\t"\
	"vpsubd	%%xmm15,%%xmm10,%%xmm10		\n\t vpsubd	%%xmm6 ,%%xmm11,%%xmm11	\n\t"\
	"vmovmskps	%%xmm10,%%r10			\n\t	vmovmskps	%%xmm11,%%rsi	\n\t"\
	"shlq	 $5,	%%r10				\n\t	shlq	 $5,	%%rsi		\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t	vmovaps	0xfe0(%%rdi),%%ymm5 		\n\t"\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t	vmovaps	0xfa0(%%rdi),%%ymm3 		\n\t"\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t	vmulpd		%%ymm5 ,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	%%ymm0,%%ymm10			\n\t	vmovaps	%%ymm1,%%ymm11				\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t	vroundpd	$0,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t	vsubpd		%%ymm11,%%ymm1,%%ymm1	\n\t"\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t	vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t	vmulpd		%%ymm8 ,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vaddpd		%%ymm13,%%ymm11,%%ymm11	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t	vmaxpd		%%ymm14,%%ymm1,%%ymm14	\n\t"\
		/* cy = DNINT(temp*baseinv[i]): */\
		"addq	%%rdi,%%r10					\n\t	addq	%%rdi,%%rsi					\n\t"\
		"vmovaps	%%ymm10	,%%ymm0			\n\t	vmovaps	%%ymm11	,%%ymm1				\n\t"\
		"vmulpd	0x600(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x600(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm12		\n\t	vroundpd	$0,%%ymm11,%%ymm13		\n\t"\
		"vmovaps	%%ymm12,%%ymm10			\n\t	vmovaps	%%ymm13,%%ymm11				\n\t"\
		/* y = (temp-cy*base[i])*wt: */\
		"vmulpd	0x400(%%r10),%%ymm10,%%ymm10\n\t	vmulpd 0x400(%%rsi),%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm11,%%ymm1 ,%%ymm1 	\n\t"\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vmulpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t"\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10	\n\t	vmovaps	 %%ymm10,%%ymm11			\n\t"\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd	$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t	vcmppd	$1,%%ymm3 ,%%ymm11,%%ymm11	\n\t"\
		"vmovmskpd	%%ymm10,%%r11			\n\t	vmovmskpd	%%ymm11,%%rdx			\n\t"\
		"shlq			$5,%%r11			\n\t	shlq			$5,%%rdx			\n\t"\
		"leaq	(%%rdi,%%r11),%%r11			\n\t	leaq	(%%rdi,%%rdx),%%rdx			\n\t"\
		"vmulpd 0x800(%%r11),%%ymm2 ,%%ymm2 \n\t	vmulpd 0x800(%%rdx),%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd 0xa00(%%r11),%%ymm4 ,%%ymm4 \n\t	vmulpd 0xa00(%%rdx),%%ymm5 ,%%ymm5 	\n\t"\
		"vmovaps	%%ymm0 ,0x0e0(%%rax)	\n\t	vmovaps	%%ymm1 ,0x1e0(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)	\n\t	vmovaps	%%ymm3 ,0xfa0(%%rdi)		\n\t"\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)	\n\t	vmovaps	%%ymm5 ,0xfe0(%%rdi)		\n\t"\
		/* Get ready for next set: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t	vmovaps		%%xmm2	,%%xmm3		\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
	"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t	vpaddd		(%%rcx)	,%%xmm6	,%%xmm6		\n\t"\
	"vmovaps	%%xmm15	,%%xmm10			\n\t	vmovaps		%%xmm6	,%%xmm11			\n\t"\
	"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t	vpcmpgtd	%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t	vpand		%%xmm3	,%%xmm11,%%xmm11	\n\t"\
	"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t	vpsubd		%%xmm11	,%%xmm6	,%%xmm6		\n\t"\
		"\n\t"\
		/* Store the two bjmodn index quartets: */\
		"movq	%[__bjmod_0],%%rbx			\n\t	movq	%[__bjmod_4],%%rcx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t	vmovaps	%%xmm6 ,(%%rcx)			\n\t"\
		/* Store cy_outs: */\
		"movq		%[__cyA],%%rbx			\n\t	movq	%[__cyB],%%rcx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)			\n\t	vmovaps	%%ymm13,(%%rcx)	\n\t"/* ymm12,13 = Our pair of four-double cy_outs */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
	/* Transpose of second complex vector-data quartet: */\
		"addq	$0x100,%%rax		\n\t"\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm0-7 back to memory to free up vector registers: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__bjmod_4]	"m" (Xbjmod_4)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__p4]   "m" (Xp4)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

	// Register-name choices in the non-transpose middle section reflect preparation for an 8-way version of this macro:
	#define AVX_cmplx_carry_fast_errcheck_X4(Xdata,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tmp-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Write ymm1-7 back to memory to free up vector registers: */\
		"												vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm8	\n\t"/* prp_mult, broadcast to all double-slots of just-freed ymm8 */\
		"movq		%[__cy],%%rbx		\n\t"\
		"vmovaps	(%%rbx),%%ymm12		\n\t"/* ymm12 = cy_in */\
		/* LOACC wts-data occupy 32 ymm-sized slots starting at (vec_dbl*)half_arr + 96 : */\
		/* half_arr + 16*[0,1,2,3] = [wts_mult,inv_mult,base,baseinv] */\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"vmovaps -0x40(%%rdi),%%ymm14	\n\t"/* ymm14 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)	\n\t"\
	"movq	%[__bjmod_0],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm15			\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm15. */\
	"movq	%[__sse_sw],%%rsi			\n\t"\
	"vmovaps	(%%rsi),%%xmm10			\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rbx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rbx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		/* This 4-way vector-carry macro uses only the even-indexed 16 of said slots, having byte offsets == 0 (mod 0x40) */\
		"vmovaps	0xc40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xc00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0,%%ymm0	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] ... store product in ymm10, since still need cy in ymm12 */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t"/* Store A.re to free up a register */\
		"vmovaps	%%ymm2 ,0xc00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xc40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"/* bjmod[0:3] += bw  */\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x20(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movq	%[__sse_sw],%%rsi		\n\t"\
	"vmovaps	(%%rsi),	%%xmm10		\n\t"/* sw[0:3] */\
	"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"/* sw[0:3] - bjmod[0:3] */\
	"vmovmskps	%%xmm10,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xcc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xc80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x20(%%rax)		\n\t"/* Store A.im to free up a register */\
		"vmovaps	%%ymm2 ,0xc80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xcc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x40(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xd40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xd00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x40(%%rax)		\n\t"/* Store B.re to free up a register */\
		"vmovaps	%%ymm2 ,0xd00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xd40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x60(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xdc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xd80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x60(%%rax)		\n\t"/* Store B.im to free up a register */\
		"vmovaps	%%ymm2 ,0xd80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xdc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE2~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0x80(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xe40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xe00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0x80(%%rax)		\n\t"/* Store C.re to free up a register */\
		"vmovaps	%%ymm2 ,0xe00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xe40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM2~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xa0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xec0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xe80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xa0(%%rax)		\n\t"/* Store C.im to free up a register */\
		"vmovaps	%%ymm2 ,0xe80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xec0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for next set [RE3~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xc0(%%rax),%%ymm0 	\n\t"/* Load data */\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xf40(%%rdi),%%ymm4 	\n\t"/* wi_re for our 4 independent carry-chains */\
		"vmovaps	0xf00(%%rdi),%%ymm2 	\n\t"/* wt_re */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* x *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_re *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_re *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xc0(%%rax)		\n\t"/* Store D.re to free up a register */\
		"vmovaps	%%ymm2 ,0xf00(%%rdi)		\n\t"/* Store wt_re */\
		"vmovaps	%%ymm4 ,0xf40(%%rdi)		\n\t"/* Store wi_re */\
		/* Get ready for next set [IM3~] : */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm0: */\
	/**********************************/\
	"vmovaps	0xe0(%%rax),%%ymm0 	\n\t"/* Load data */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm10		\n\t"\
		"vpsubd		%%xmm15,%%xmm10,%%xmm10	\n\t"\
		"vmovmskps	%%xmm10,	%%rsi	\n\t"\
		"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"vmovaps	0xfc0(%%rdi),%%ymm4 	\n\t"/* wi_im for our 4 independent carry-chains */\
		"vmovaps	0xf80(%%rdi),%%ymm2 	\n\t"/* wt_im */\
		"vmulpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t"/* y *= wtinv */\
		"vmovaps	%%ymm0 ,%%ymm10			\n\t"/* temp = y */\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"/* temp = DNINT(y) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* y - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(y-temp) */\
		"vmulpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm12,%%ymm10,%%ymm10	\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm14,%%ymm0,%%ymm14	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm12	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm12,%%ymm12			\n\t"/* cy_out */\
		/* y = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm12,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* y = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm2 ,%%ymm0 ,%%ymm0 		\n\t"/* y *= wt */\
		/* Update and store weights: */\
		"vmovaps	 0xbe0(%%rdi),%%ymm10		\n\t"/* inv_mult[1] */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"vcmppd		$1,%%ymm2 ,%%ymm10,%%ymm10	\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"vmovmskpd	%%ymm10,%%rdx			\n\t"/* Extract cmp-results into 4-bit signmask */\
		"shlq			$5,%%rdx			\n\t"/* ...and mpy by ymm bytewidth. */\
		"leaq	(%%rdi,%%rdx),%%rdx			\n\t"/* address = half_arr + i */\
		"vmulpd	 0x800(%%rdx),%%ymm2 ,%%ymm2 	\n\t"/* wt_im *= wts_mult[i] */\
		"vmulpd	 0xa00(%%rdx),%%ymm4 ,%%ymm4 	\n\t"/* wi_im *= inv_mult[i] */\
		"vmovaps	%%ymm0 ,0xe0(%%rax)		\n\t"/* Store D.im to free up a register */\
		"vmovaps	%%ymm2 ,0xf80(%%rdi)		\n\t"/* Store wt_im */\
		"vmovaps	%%ymm4 ,0xfc0(%%rdi)		\n\t"/* Store wi_im */\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_n]	,%%rbx			\n\t"\
		"vmovaps	(%%rbx)	,%%xmm2			\n\t"\
		"movq	%[__sse_bw]	,%%rcx			\n\t"\
		"vpaddd		(%%rcx)	,%%xmm15,%%xmm15	\n\t"\
		"vmovaps	%%xmm15	,%%xmm10			\n\t"\
		"vpcmpgtd	%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpand		%%xmm2	,%%xmm10,%%xmm10	\n\t"\
		"vpsubd		%%xmm10	,%%xmm15,%%xmm15	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rbx		\n\t"\
		"vmovaps	%%xmm15,(%%rbx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm12,(%%rbx)	\n\t"/* cy_out = ymm12 */\
		/* Store maxerr: */\
		"vmovaps	%%ymm14,-0x40(%%rdi)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm0 ,    (%%rax)		\n\t		vmovaps	%%ymm1 ,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm2 ,0x40(%%rax)		\n\t		vmovaps	%%ymm3 ,0x60(%%rax)		\n\t"\
		"vmovaps	%%ymm6 ,0xc0(%%rax)		\n\t		vmovaps	%%ymm7 ,0xe0(%%rax)		\n\t"\
		"vmovaps	%%ymm4 ,0x80(%%rax)		\n\t		vmovaps	%%ymm5 ,0xa0(%%rax)		\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm12","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

#endif

	// Mar 2016: Bumped mem-offsets for broadcast-loads from 0x800-based to 0x1000-based for compatibility with new LOACC-carry memory layout
	#define AVX_cmplx_carry_norm_errcheck_X4(Xdata,XwtA,XwtB,XwtC,Xcy,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
	"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"movq		%[__data],%%rax		\n\t"\
	/* Transpose uses algo [1b] in util.c:test_simd_transpose_4x4(), which is ~4 cycles faster than algo [1a]. */\
	/* 4-way transpose of inputs (Re, Im parts separately): Inputs from r0/1,2/3,4/5.6/7. Outputs into ymm0-7: */\
	/* Real parts use ymm0,2,4,6, ymm8 as tmp-reg:					Imag parts use ymm1,3,5,7, ymm9 as tm-reg: */\
	"vmovaps	    (%%rax),%%xmm0				\n\t	vmovaps	0x20(%%rax),%%xmm1					\n\t"/* r0.lo = 0,1,-,- */\
	"vmovaps	0x40(%%rax),%%xmm8				\n\t	vmovaps	0x60(%%rax),%%xmm9					\n\t"/* r1.lo = 4,5,-,- */\
	"vinsertf128 $1,0x80(%%rax),%%ymm0,%%ymm0	\n\t	vinsertf128 $1,0xa0(%%rax),%%ymm1,%%ymm1	\n\t"/* r0|r2.lo = 0,1,8,9 */\
	"vinsertf128 $1,0xc0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xe0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.lo = 4,5,c,d */\
	"vshufpd	$15,%%ymm8,%%ymm0,%%ymm2		\n\t	vshufpd	$15,%%ymm9,%%ymm1,%%ymm3			\n\t"/* Row 1 = 1,5,9,d */\
	"vshufpd	$0 ,%%ymm8,%%ymm0,%%ymm0		\n\t	vshufpd	$0 ,%%ymm9,%%ymm1,%%ymm1			\n\t"/* Row 0 = 0,4,8,c */\
	"vmovaps	0x10(%%rax),%%xmm4				\n\t	vmovaps	0x30(%%rax),%%xmm5					\n\t"/* r0.hi = 2,3,-,- */\
	"vmovaps	0x50(%%rax),%%xmm8				\n\t	vmovaps	0x70(%%rax),%%xmm9					\n\t"/* r1.hi = 6,7,-,- */\
	"vinsertf128 $1,0x90(%%rax),%%ymm4,%%ymm4	\n\t	vinsertf128 $1,0xb0(%%rax),%%ymm5,%%ymm5	\n\t"/* r0|r2.hi = 2,3,a,b */\
	"vinsertf128 $1,0xd0(%%rax),%%ymm8,%%ymm8	\n\t	vinsertf128 $1,0xf0(%%rax),%%ymm9,%%ymm9	\n\t"/* r1|r3.hi = 6,7,e,f */\
	"vshufpd	$15,%%ymm8,%%ymm4,%%ymm6		\n\t	vshufpd	$15,%%ymm9,%%ymm5,%%ymm7			\n\t"/* Row 3 = 3,7,b,f */\
	"vshufpd	$0 ,%%ymm8,%%ymm4,%%ymm4		\n\t	vshufpd	$0 ,%%ymm9,%%ymm5,%%ymm5			\n\t"/* Row 2 = 2,6,a,e */\
		/* Move ymm7 to mem to free up a vreg: */\
		"vmovaps	%%ymm7,0x0e0(%%rax)			\n\t"\
		"movq	%[__prp_mult]	,%%rbx	\n\t"\
		"vbroadcastsd	(%%rbx),%%ymm7	\n\t"/* prp_mult, broadcast to all double-slots of ymm7 */\
		/* Won't need main-array again until output transpose, so re-use rax for half_arr */\
		"movq	 %[__half_arr],%%rax	\n\t"\
		/* half_arr + 16*[0,1,2,3] = [wt,wt_inv,base,baseinv] */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	(%%rbx),%%ymm14	\n\t"/* ymm14 = cy_in */\
		"vmovaps	-0x40(%%rax),%%ymm15	\n\t"/* ymm15 = maxerr */\
	/**********************************/\
	/* Do A.re-quartet: Data in ymm0: */\
	/**********************************/\
	"prefetcht0	(%%r14)		\n\t"\
		"movq	%[__bjmod_0],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm8 		\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm8. */\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"/* sw[0:3] */\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
	"movslq	%[__i]	,%%rcx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rcx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x0(%%rcx)	,%%xmm9 	\n\t"/* n_minus_sil in low 32 bits of xmm9  */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"/* Broadcast low 32 bits of xmm9  to all 4 slots of xmm9  */\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x0(%%rdx)	,%%xmm10		\n\t"/* sinwt in low 32 bits of xmm10 */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"/* Broadcast low 32 bits of xmm10 to all 4 slots of xmm10 */\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"/* xmm11 = bjmod[0:3] copy */\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"/* bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm11,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__half_arr],%%rdi	\n\t"\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"vmovaps	     (%%rax),%%ymm12 	\n\t"/* wtA[j  ]; ebx FREE */\
		"vmovaps	-0x10(%%rbx),%%ymm13	\n\t"/* wtB[j-1] - It may not look like it but this is in fact an aligned load */\
		/* reverse-running indexing used for inv-wts really means we need to reverse ordering of 4 doubles d0-3 in ymm13 */\
		"vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1000(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1008(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"/* wt   =wtA*wtl */\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"/* wtinv=wtB*wtn */\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm0,%%ymm0		\n\t"/* x *= wtinv; ymm10 FREE */\
		"vmovaps	%%ymm0,%%ymm10		\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10	\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm0,%%ymm0	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm0,%%ymm0	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm0,%%ymm15	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm0				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm10,%%ymm14			\n\t"/* cy_out */\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm0 ,%%ymm0 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm9 ,%%ymm0 ,%%ymm0 		\n\t"/* x *= wt */\
		/* Get ready for next set [IM0~] by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do A.im-quartet: Data in ymm1: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"/* sw[0:3] */\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask <i3|i2|i1|i0>; idxs into base/inv table */\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x0(%%rcx)	,%%xmm9 	\n\t"/* n_minus_sil in low 32 bits of xmm9  */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"/* Broadcast low 32 bits of xmm9  to all 4 slots of xmm9  */\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask <m3|m2|m1|m0>; idxs into base/inv tables -> byte[2] of ecx... */\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x0(%%rdx)	,%%xmm10		\n\t"/* sinwt in low 32 bits of xmm10 */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"/* Broadcast low 32 bits of xmm10 to all 4 slots of xmm10 */\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"/* xmm11 = bjmod[0:3] copy */\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"/* bjmod[0:3] - sinwt */\
		"vmovmskps	%%xmm11,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask <n3|n2|n1|n0>; idxs into base/inv tables -> byte[1] of edx... */\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rcx	\n\t"/* m0123 << 5 (= lshift to give ptr offset for ymm-size data */\
	"shlq	$5,%%rdx	\n\t"/* n0123 << 5 (= lshift to give ptr offset for ymm-size data */\
		"\n\t"\
		"movq	%[__wtC]	,%%rbx		\n\t"/* wtA unchanged; wtB == wtC for remaining 7 of 8 sets of carries */\
		"vmovaps	-0x10(%%rbx),%%ymm13	\n\t"/* wtC[j-1]; load doubles from rcx+[-0x10,-0x08, 0, +0x08] - note this is an 'aligned load in disguise' */\
		"vshufpd	$5,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[0123] -> d[1032] */\
		"vperm2f128 $1,%%ymm13,%%ymm13,%%ymm13	\n\t"/* d[1032] -> d[3210] */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		/* SSE2 version has double-copies in wtl/wtn ... AVX replaces redundant-data loads with load-with-broadcast: */\
		"vbroadcastsd 0x1010(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1018(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"/* wt   =wtA*wtlp1 */\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"/* wtinv=wtC*wtnm1 */\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 	\n\t"/* wt    *= one_half[m0123] */\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10	\n\t"/* wtinv *= one_half[16+n0123] */\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm1,%%ymm1		\n\t"/* x *= wtinv; ymm10 FREE */\
		"vmovaps	%%ymm1,%%ymm10		\n\t"/* temp = x */\
		"vroundpd	$0,%%ymm10,%%ymm10	\n\t"/* temp = DNINT(x) */\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx	\n\t"\
		"vsubpd		%%ymm10,%%ymm1,%%ymm1	\n\t"/* x - temp */\
		"vandpd		(%%rbx),%%ymm1,%%ymm1	\n\t"/* frac = fabs(x-temp) */\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm1,%%ymm15	\n\t"/* if(frac > maxerr) maxerr=frac */\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm1				\n\t"/* cpy temp */\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10	\n\t"/* temp*baseinv[i0123] */\
		"vroundpd	$0,%%ymm10,%%ymm14			\n\t"/* cy_out */\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm1 ,%%ymm1 		\n\t"/* x = (temp-cy*base[i0123]) */\
		"vmulpd		%%ymm9 ,%%ymm1 ,%%ymm1 		\n\t"/* x *= wt */\
		/* Get ready for next set [RE1~] by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.re-quartet: Data in ymm2: */\
	/**********************************/\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x4(%%rcx)	,%%xmm9 	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x4(%%rdx)	,%%xmm10	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1020(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1028(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vmovaps	%%ymm2,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm2,%%ymm2	\n\t"\
		"vandpd		(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm2,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm2			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm2 ,%%ymm2 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm2 ,%%ymm2 	\n\t"\
		/* Get ready for next set [IM1~] by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do B.im-quartet: Data in ymm3: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x4(%%rcx)	,%%xmm9 	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x4(%%rdx)	,%%xmm10	\n\t"/* .d1 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1030(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1038(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%%ymm3,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vandpd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm3,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm3			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm3 ,%%ymm3 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm3 ,%%ymm3 	\n\t"\
		/* Get ready for next set [RE2~] by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.re-quartet: Data in ymm4: */\
	/**********************************/\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0x8(%%rcx)	,%%xmm9 	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0x8(%%rdx)	,%%xmm10	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1040(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1048(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm4,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm4,%%ymm4	\n\t"\
		"vandpd		(%%rbx),%%ymm4,%%ymm4	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm4,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm4			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm4 ,%%ymm4 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm4 ,%%ymm4 	\n\t"\
		/* Get ready for next set [IM2~] by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do C.im-quartet: Data in ymm5: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0x8(%%rcx)	,%%xmm9 	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0x8(%%rdx)	,%%xmm10	\n\t"/* .d2 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1050(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1058(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm5,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm5,%%ymm5	\n\t"\
		"vandpd		(%%rbx),%%ymm5,%%ymm5	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm5,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm5			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm5 ,%%ymm5 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm5 ,%%ymm5 	\n\t"\
		/* Get ready for next set [RE3~] : by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.re-quartet: Data in ymm6: */\
	/**********************************/\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_sil],%%rcx	\n\t"\
		"vmovd	0xC(%%rcx)	,%%xmm9 	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwt]	,%%rdx		\n\t"\
		"vmovd	0xC(%%rdx)	,%%xmm10	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1060(%%rdi),%%ymm9 	\n\t"/* wtl */\
		"vbroadcastsd 0x1068(%%rdi),%%ymm10	\n\t"/* wtn */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm6,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm6,%%ymm6	\n\t"\
		"vandpd		(%%rbx),%%ymm6,%%ymm6	\n\t"\
		"vmulpd		%%ymm7 ,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm6,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm6			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm6 ,%%ymm6 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm6 ,%%ymm6 	\n\t"\
		/* Get ready for next set [IM3~] : by computing bjmod[0:3] += bw (mod n): */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
	/**********************************/\
	/* Do D.im-quartet: Data in ymm7: */\
	/**********************************/\
		"movq	%[__sse_sw],%%rsi		\n\t"\
		"vmovaps	(%%rsi),	%%xmm11		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm11,%%xmm11	\n\t"\
		"vmovmskps	%%xmm11,	%%rsi	\n\t"\
		"\n\t"\
		"movq	%[__n_minus_silp1],%%rcx	\n\t"\
		"vmovd	0xC(%%rcx)	,%%xmm9 	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm9 ,%%xmm9 		\n\t"\
		"vpsubd		%%xmm8 ,%%xmm9 ,%%xmm9 		\n\t"\
		"vmovmskps	%%xmm9 ,%%rcx		\n\t"\
		"\n\t"\
		"movq	%[__sinwtm1]	,%%rdx		\n\t"\
		"vmovd	0xC(%%rdx)	,%%xmm10	\n\t"/* .d3 term of index quartet */\
		"vpshufd	$0,	%%xmm10,%%xmm10		\n\t"\
		"vmovaps		%%xmm8 ,%%xmm11		\n\t"\
		"vpsubd		%%xmm10,%%xmm11,%%xmm11		\n\t"\
		"vmovmskps	%%xmm11,%%rdx		\n\t"\
	/* reload ymm7 from mem - this overwrites the prp_mult data, which is why we first copy the latter to just-fred ymm11: */\
	"movq		%[__data],%%rax			\n\t"\
	"vmovaps	%%ymm7,%%ymm11			\n\t"\
	"vmovaps	0x0e0(%%rax),%%ymm7		\n\t"\
		"\n\t"\
	"shlq	$5,%%rsi	\n\t"/* i0123 */\
	"shlq	$5,%%rcx	\n\t"/* m0123 */\
	"shlq	$5,%%rdx	\n\t"/* n0123 */\
		"\n\t"\
		"addq	%%rdi,%%rcx		\n\t"\
		"addq	%%rdi,%%rdx		\n\t"\
		"vbroadcastsd 0x1070(%%rdi),%%ymm9 	\n\t"/* wtlp1 */\
		"vbroadcastsd 0x1078(%%rdi),%%ymm10	\n\t"/* wtnm1 */\
		"vmulpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	     (%%rcx),%%ymm9 ,%%ymm9 \n\t"\
		"vmulpd	0x200(%%rdx),%%ymm10,%%ymm10\n\t"\
		"\n\t"\
		"vmulpd		%%ymm10,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm7,%%ymm10			\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm10		\n\t"\
		"\n\t"\
		"movq	%[__sign_mask],%%rbx		\n\t"\
		"vsubpd		%%ymm10,%%ymm7,%%ymm7	\n\t"\
		"vandpd		(%%rbx),%%ymm7,%%ymm7	\n\t"\
		"vmulpd		%%ymm11,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult[now in ymm11] */\
		"vaddpd		%%ymm14,%%ymm10,%%ymm10		\n\t"/* temp = temp*prp_mult + cy */\
		"vmaxpd		%%ymm15,%%ymm7,%%ymm15	\n\t"\
		/* cy   = DNINT(temp*baseinv[i1]): */\
		"addq	%%rdi,%%rsi		\n\t"\
		"vmovaps	%%ymm10	,%%ymm7			\n\t"\
		"vmulpd	0x600(%%rsi),%%ymm10,%%ymm10\n\t"\
		"vroundpd	$0,%%ymm10,%%ymm14		\n\t"\
		"vmovaps	%%ymm14,%%ymm10				\n\t"/* cy = cpy cy_out */\
		/* x = (temp-cy*base[i1])*wt: */\
		"vmulpd	0x400(%%rsi),%%ymm10,%%ymm10	\n\t"/* cy*base[i0123] */\
		"vsubpd		%%ymm10,%%ymm7 ,%%ymm7 	\n\t"\
		"vmulpd		%%ymm9 ,%%ymm7 ,%%ymm7 	\n\t"\
		/* Update wts-array pointers in preparation for next call of the macro: */\
		"movq	%[__wtA]	,%%rax		\n\t"\
		"movq	%[__wtB]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"\
		"addq	$0x20	,%%rax			\n\t"/* add0 += 4 */\
		"subq	$0x20	,%%rbx			\n\t"/* add1 -= 4 */\
		"subq	$0x20	,%%rcx			\n\t"/* add2 -= 4 */\
		"movq	%%rax	,%[__wtA]		\n\t"\
		"movq	%%rbx	,%[__wtB]		\n\t"\
		"movq	%%rcx	,%[__wtC]		\n\t"\
		"\n\t"\
		/* Get ready for store of final-updated bjmod[0:3] values: */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"vmovaps		(%%rbx)	,%%xmm10	\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"vpaddd		(%%rax),%%xmm8 ,%%xmm8 	\n\t"\
		"vmovaps	%%xmm8 ,%%xmm9 		\n\t"\
		"vpcmpgtd	%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpand		%%xmm10,%%xmm9 ,%%xmm9 	\n\t"\
		"vpsubd		%%xmm9 ,%%xmm8 ,%%xmm8 	\n\t"\
		"\n\t"\
		/* Store bjmodn index quartet: */\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"vmovaps	%%xmm8,(%%rcx)			\n\t"\
		/* Store cy_out: */\
		"movq		%[__cy],%%rbx	\n\t"\
		"vmovaps	%%ymm14,(%%rbx)	\n\t"/* ymm14 = cy_in */\
		/* Store maxerr: */\
		"movq		%[__half_arr],%%rdx		\n\t"\
		"vmovaps	%%ymm15,-0x40(%%rdx)	\n\t"\
	/* 4-way transpose of outputs (Re, Im parts separately): Inputs from ymm0-7. Outputs into r0/1,2/3,4/5.6/7: */	\
	/* Because default inputs for our 4 x 4 transpose macro (e.g. the one used at start of this carry macro) */\
	/* are into ymm4/2/8/2, munge inputs into that order, resolving name-conflicts via use of the now-available ymm8-15 for outputs: */\
		"movq		%[__data],%%rax			\n\t"\
		"vshufpd	$15,%%ymm2,%%ymm0,%%ymm10					\n\t		vshufpd	$15,%%ymm3,%%ymm1,%%ymm11						\n\t"\
		"vshufpd	$0 ,%%ymm2,%%ymm0,%%ymm0					\n\t		vshufpd	$0 ,%%ymm3,%%ymm1,%%ymm1						\n\t"\
		"vshufpd	$15,%%ymm6,%%ymm4,%%ymm12					\n\t		vshufpd	$15,%%ymm7,%%ymm5,%%ymm13						\n\t"\
		"vshufpd	$0 ,%%ymm6,%%ymm4,%%ymm4					\n\t		vshufpd	$0 ,%%ymm7,%%ymm5,%%ymm5						\n\t"\
		"vperm2f128 $32,%%ymm12,%%ymm10,%%ymm2 		/* Re B	*/	\n\t		vperm2f128 $32,%%ymm13,%%ymm11,%%ymm3		/* Im B	*/	\n\t"\
		"vperm2f128 $49,%%ymm12,%%ymm10,%%ymm10		/* Re D	*/	\n\t		vperm2f128 $49,%%ymm13,%%ymm11,%%ymm11		/* Im D	*/	\n\t"\
		"vperm2f128 $32,%%ymm4 ,%%ymm0 ,%%ymm12		/* Re A	*/	\n\t		vperm2f128 $32,%%ymm5 ,%%ymm1 ,%%ymm13 		/* Im A	*/	\n\t"\
		"vperm2f128 $49,%%ymm4 ,%%ymm0 ,%%ymm0 		/* Re C	*/	\n\t		vperm2f128 $49,%%ymm5 ,%%ymm1 ,%%ymm1		/* Im C	*/	\n\t"\
		/* And write 'em back to memory: */\
		"vmovaps	%%ymm12,     (%%rax)						\n\t		vmovaps	%%ymm13,0x020(%%rax)				\n\t"\
		"vmovaps	%%ymm2 ,0x040(%%rax)						\n\t		vmovaps	%%ymm3 ,0x060(%%rax)				\n\t"\
		"vmovaps	%%ymm0 ,0x080(%%rax)						\n\t		vmovaps	%%ymm1 ,0x0a0(%%rax)				\n\t"\
		"vmovaps	%%ymm10,0x0c0(%%rax)						\n\t		vmovaps	%%ymm11,0x0e0(%%rax)				\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__cy]		"m" (Xcy)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"/* Clobbered registers */\
	);\
	}

#elif !defined(USE_ARM_V8_SIMD)	// 64-bit SSE2:

	/********* Packed 32-bit-int version of SSE2_cmplx_carry_norm_pow2_errcheck0_2x:***********/
	#define SSE2_cmplx_carry_norm_pow2_errcheck1_2B(Xdata,XwtA,XwtB,XwtC,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
		"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"prefetcht0	(%%r14)		\n\t"\
	/***************Unpack the data:*************************/\
		"movq	%[__data]	,%%rax	\n\t"\
		"movaps		    (%%rax)	,%%xmm1	\n\t	movaps		0x40(%%rax)	,%%xmm5	\n\t"/* r1, this is the active  xmm register */\
		"movaps		    (%%rax)	,%%xmm2	\n\t	movaps		0x40(%%rax)	,%%xmm6	\n\t"/* r1, this is the scratch xmm register */\
		"unpcklpd	0x20(%%rax)	,%%xmm1	\n\t	unpcklpd	0x60(%%rax)	,%%xmm5	\n\t"/* r1 -x- r3 (lo halves) ==> R0~ */\
		"unpckhpd	0x20(%%rax)	,%%xmm2	\n\t	unpckhpd	0x60(%%rax)	,%%xmm6	\n\t"/* r1 -x- r3 (hi halves) ==> R1~ */\
		"movaps		%%xmm2, 0x20(%%rax)	\n\t	movaps		%%xmm6, 0x60(%%rax)	\n\t"/* Tmp store R1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
		"\n\t"\
		"movaps		0x10(%%rax)	,%%xmm2	\n\t	movaps		0x50(%%rax)	,%%xmm6	\n\t"\
		"movaps		0x10(%%rax)	,%%xmm3	\n\t	movaps		0x50(%%rax)	,%%xmm7	\n\t"\
		"unpcklpd	0x30(%%rax)	,%%xmm2	\n\t	unpcklpd	0x70(%%rax)	,%%xmm6	\n\t"/* r2 -x- r4 (lo halves) ==> I0~ */\
		"unpckhpd	0x30(%%rax)	,%%xmm3	\n\t	unpckhpd	0x70(%%rax)	,%%xmm7	\n\t"/* r2 -x- r4 (hi halves) ==> I1~ */\
		"movaps		%%xmm2, 0x10(%%rax)	\n\t	movaps		%%xmm6, 0x50(%%rax)	\n\t"/* Tmp store I0~ until needed by imaginary-part-processing section */\
		"movaps		%%xmm3, 0x30(%%rax)	\n\t	movaps		%%xmm7, 0x70(%%rax)	\n\t"/* Tmp store I1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
	/* Active data in xmm1,5 here - avoid using those registers in index computation. */\
	/**********************************************/\
	/*          Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
		"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3] */\
		"movq	%[__sse_sw]	,	%%rbx	\n\t"\
		"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
		"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
	"movslq	%[__i]	,%%rcx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rcx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
		"shlq	$24		,%%rsi			\n\t"/* <i3|i2|i1|i0>; Packed indices into base,base_inv tables; move into leftmost byte[3] */\
		"movaps		%%xmm0	,%%xmm7		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil],%%rcx	\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"/* n_minus_sil in low 32 bits of xmm2 */\
		"pshufd	$0,	%%xmm2	,%%xmm2		\n\t"/* Broadcast low 32 bits of xmm2 to all 4 slots of xmm2 */\
		"psubd		%%xmm0	,%%xmm2		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"movmskps	%%xmm2	,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$16		,%%rcx			\n\t"/* <m3|m2|m1|m0>; Packed indices into base,base_inv tables; move into leftmost byte[2] of ecx... */\
		"addq	%%rcx	,%%rsi			\n\t"/* ....and fold into esi. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"/* sinwt in low 32 bits of xmm3 */\
		"pshufd	$0,	%%xmm3	,%%xmm3		\n\t"/* Broadcast low 32 bits of xmm3 to all 4 slots of xmm3 */\
		"psubd		%%xmm3	,%%xmm7		\n\t"/* bjmod[0:3] - sinwt */\
		"movmskps	%%xmm7	,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$8		,%%rdx			\n\t"/* <n3|n2|n1|n0>; Packed indices into base,base_inv tables; move into leftmost byte[1] of edx... */\
		"addq	%%rdx	,%%rsi			\n\t"/* ....and fold into esi. */\
		"\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		-0x10(%%rax),%%xmm4	\n\t"/* sse2_rnd */\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm6	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"/* wtC[j-1]; ecx FREE */\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"/* [NOTE: movhpd/movlpd preferable to movupd/shufpd] */\
	/* The commented-out-below variant of the above movhpd/movlpd combo is no faster in SSE2 mode, but generalizes better to AVX: */\
	/*	movaps		     (%%rcx),%%xmm3	\n\t	movaps		-0x10(%%rcx),%%xmm7	\n\t	*/\
	/*	shufpd		$1 ,%%xmm3,%%xmm3	\n\t	shufpd		$1 ,%%xmm7,%%xmm7	\n\t	*/\
		"\n\t"\
		"subq	$0x20	,%%rcx			\n\t"/* add1 -= 4 */\
		"movq	%%rcx	,%[__wtB]		\n\t"\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtl,wtn address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm6	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x190(%%rax),%%xmm3	\n\t	mulpd		0x190(%%rax),%%xmm7	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm6	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm7	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wtinv; xmm3,xmm7 FREE */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* i0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
		"mulpd		0xc0(%%rdi),%%xmm3	\n\t	mulpd		0xc0(%%rbx),%%xmm7	\n\t"/* temp*baseinv[i1] */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps		%%xmm3	,(%%rcx)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi),	%%xmm3	\n\t	mulpd	 0x80(%%rbx),	%%xmm7	\n\t"/* cy*base[i1] */\
		"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i1]) */\
		"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt */\
		"movaps		%%xmm1	,    (%%rax)\n\t	movaps		%%xmm5	,0x40(%%rax)\n\t"/* store x */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*          Imaginary parts                   */\
	/**********************************************/\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw]	,%%rdx		\n\t"\
		"movaps	(%%rdx)	,%%xmm1			\n\t"/* sw[0:3] */\
		"psubd	%%xmm0	,%%xmm1			\n\t"/* sw[0:3] - bjmod[0:3] */\
		"movmskps	%%xmm1	,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$24	,%%rsi				\n\t"/* <i3|i2|i1|i0>; Packed indices into base,base_inv tables; move into leftmost byte[3] */\
		"movaps	%%xmm0	,%%xmm1			\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_silp1],%%rcx\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"/* n_minus_silp1 in low 32 bits of xmm2 */\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t"/* Broadcast low 32 bits of xmm2 to all 4 slots of xmm2 */\
		"psubd	%%xmm0	,%%xmm2			\n\t"/* n_minus_silp1 - bjmod[0:3] */\
		"movmskps	%%xmm2	,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$16	,%%rcx				\n\t"/* <m3|m2|m1|m0>; Packed indices into base,base_inv tables; move into leftmost byte[2] of ecx... */\
		"addq	%%rcx	,%%rsi			\n\t"/* ....and fold into esi. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"/* sinwtm1 in low 32 bits of xmm3 */\
		"pshufd	$0	,%%xmm3	,%%xmm3		\n\t"/* Broadcast low 32 bits of xmm3 to all 4 slots of xmm3 */\
		"psubd	%%xmm3	,%%xmm1			\n\t"/* bjmod[0:3] - sinwtm1 */\
		"movmskps	%%xmm1	,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$8	,%%rdx				\n\t"/* <n3|n2|n1|n0>; Packed indices into base,base_inv tables; move into leftmost byte[1] of edx... */\
		"addq	%%rdx	,%%rsi			\n\t"/* ....and fold into esi. */\
		"movq	%[__data]	,%%rax		\n\t"\
		"movaps	 0x10(%%rax)	,%%xmm1	\n\t	movaps	 0x50(%%rax)	,%%xmm5	\n\t"/* I1~ */\
		/* Don't explicitly load address of sse2_rnd, since we know it's in [half_arr - 0x10]. */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"\n\t"\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm6	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"/* wtC[j-1]; ecx FREE */\
	/*	movaps		     (%%rcx),%%xmm3	\n\t	movaps		-0x10(%%rcx),%%xmm7	\n\t	*/\
	/*	shufpd		$1 ,%%xmm3,%%xmm3	\n\t	shufpd		$1 ,%%xmm7,%%xmm7	\n\t	*/\
		"\n\t"\
		"addq	$0x20	,%%rbx			\n\t"/* add0 += 4 */\
		"subq	$0x20	,%%rcx			\n\t"/* add2 -= 4 */\
		"movq	%%rbx	,%[__wtA]		\n\t"\
		"movq	%%rcx	,%[__wtC]		\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtlp1,wtnm1 address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd	 0x1a0(%%rax)	,%%xmm2	\n\t	mulpd	 0x1a0(%%rax)	,%%xmm6	\n\t"/* wt   =wtA*wtlp1 */\
		"mulpd	 0x1b0(%%rax)	,%%xmm3	\n\t	mulpd	 0x1b0(%%rax)	,%%xmm7	\n\t"/* wtinv=wtC*wtnm1 */\
		"mulpd	      (%%rdi)	,%%xmm2	\n\t	mulpd	      (%%rbx)	,%%xmm6	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd	 0x040(%%rdx)	,%%xmm3	\n\t	mulpd	 0x040(%%rcx)	,%%xmm7	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		/* sign_mask still in xmm8: */\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* i0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps	%%xmm3,	%%xmm1			\n\t	movaps	%%xmm7,	%%xmm5			\n\t"/* cpy temp */\
		"mulpd	 0xc0(%%rdi),%%xmm3		\n\t	mulpd	 0xc0(%%rbx),%%xmm7		\n\t"/* temp*baseinv[i1] */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps	%%xmm3	,(%%rcx)		\n\t	movaps	%%xmm7	,(%%rdx)		\n\t"/* store cy_out */\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"/* cy*base[i1] */\
		"subpd	%%xmm3	,%%xmm1			\n\t	subpd	%%xmm7	,%%xmm5			\n\t"/* y = (temp-cy*base[i1]) */\
		"mulpd	%%xmm2	,%%xmm1			\n\t	mulpd	%%xmm6	,%%xmm5			\n\t"/* y*= wt */\
		"movaps	%%xmm1	, 0x10(%%rax)	\n\t	movaps	%%xmm5	, 0x50(%%rax)	\n\t"/* store y */\
		/* Get ready for next set [RE1~, IM1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd	(%%rax)	,%%xmm0			\n\t"/* bjmod[0:3] += bw  */\
		"pand	(%%rbx)	,%%xmm0			\n\t"/* bjmod[0:3] &= nm1 */\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"movaps	%%xmm0,(%%rcx)			\n\t"/* Write bjmod[0:3] */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm10"	/* Clobbered registers */\
	);\
	}

	/********* Packed 32-bit-int version of SSE2_cmplx_carry_norm_pow2_errcheck2_2x:***********/
	#define SSE2_cmplx_carry_norm_pow2_errcheck2_2B(Xdata,XwtA,XwtB,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
		"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
	/**********************************************/\
	/*          Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
		"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3] */\
		"movq	%[__sse_sw]	,	%%rbx	\n\t"\
		"movaps		(%%rbx)	,	%%xmm1	\n\t"/* sw[0:3] */\
		"psubd		%%xmm0	,	%%xmm1	\n\t"/* sw[0:3] - bjmod[0:3] */\
		"movmskps	%%xmm1	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$24		,%%rsi			\n\t"/* <i3|i2|i1|i0>; Packed indices into base,base_inv tables; move into leftmost byte[3] */\
		"movaps		%%xmm0	,%%xmm1		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil],%%rcx	\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"/* n_minus_sil in low 32 bits of xmm2 */\
		"pshufd	$0,	%%xmm2	,%%xmm2		\n\t"/* Broadcast low 32 bits of xmm2 to all 4 slots of xmm2 */\
		"psubd		%%xmm0	,%%xmm2		\n\t"/* n_minus_sil - bjmod[0:3] */\
		"movmskps	%%xmm2	,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$16		,%%rcx			\n\t"/* <m3|m2|m1|m0>; Packed indices into base,base_inv tables; move into leftmost byte[2] of ecx... */\
		"addq	%%rcx	,%%rsi			\n\t"/* ....and fold into esi. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"/* sinwt in low 32 bits of xmm3 */\
		"pshufd	$0,	%%xmm3	,%%xmm3		\n\t"/* Broadcast low 32 bits of xmm3 to all 4 slots of xmm3 */\
		"psubd		%%xmm3	,%%xmm1		\n\t"/* bjmod[0:3] - sinwt */\
		"movmskps	%%xmm1	,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$8		,%%rdx			\n\t"/* <n3|n2|n1|n0>; Packed indices into base,base_inv tables; move into leftmost byte[1] of edx... */\
		"addq	%%rdx	,%%rsi			\n\t"/* ....and fold into esi. */\
		"movq	%[__data],%%rax			\n\t"\
		"movaps	 0x20(%%rax),%%xmm1		\n\t	movaps		 0x60(%%rax),%%xmm5\n\t"/* R1~ */\
		/* Don't explicitly load address of sse2_rnd, since we know it's in [half_arr - 0x10]. */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		-0x10(%%rax),%%xmm4	\n\t"/* sse2_rnd */\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm6	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"/* wtC[j-1]; ecx FREE */\
	/*	movaps		     (%%rcx),%%xmm3	\n\t	movaps		-0x10(%%rcx),%%xmm7	\n\t	*/\
	/*	shufpd		$1 ,%%xmm3,%%xmm3	\n\t	shufpd		$1 ,%%xmm7,%%xmm7	\n\t	*/\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtl,wtn address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm6	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x190(%%rax),%%xmm3	\n\t	mulpd		0x190(%%rax),%%xmm7	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm6	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm7	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x = x*wtinv; xmm3,xmm7 FREE */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* i0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
		"mulpd		0xc0(%%rdi)	,%%xmm3	\n\t	mulpd		0xc0(%%rbx)	,%%xmm7	\n\t"/* temp*baseinv[i1] */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps		%%xmm3	,(%%rcx)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"/* cy*base[i1] */\
		"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i1]) */\
		"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt */\
		"movaps		%%xmm1	,0x20(%%rax)\n\t	movaps		%%xmm5	,0x60(%%rax)\n\t"/* store x */\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*          Imaginary parts                   */\
	/**********************************************/\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw]	,%%rdx		\n\t"\
		"movaps	(%%rdx)	,%%xmm1			\n\t"/* sw[0:3] */\
		"psubd	%%xmm0	,%%xmm1			\n\t"/* sw[0:3] - bjmod[0:3] */\
		"movmskps	%%xmm1	,%%rsi		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$24	,%%rsi				\n\t"/* <i3|i2|i1|i0>; Packed indices into base,base_inv tables; move into leftmost byte[3] */\
		"movaps	%%xmm0	,%%xmm1			\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_silp1],%%rcx\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"/* n_minus_silp1 in low 32 bits of xmm2 */\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t"/* Broadcast low 32 bits of xmm2 to all 4 slots of xmm2 */\
		"psubd	%%xmm0	,%%xmm2			\n\t"/* n_minus_silp1 - bjmod[0:3] */\
		"movmskps	%%xmm2	,%%rcx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$16	,%%rcx				\n\t"/* <m3|m2|m1|m0>; Packed indices into base,base_inv tables; move into leftmost byte[2] of ecx... */\
		"addq	%%rcx	,%%rsi			\n\t"/* ....and fold into esi. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"/* sinwtm1 in low 32 bits of xmm3 */\
		"pshufd	$0	,%%xmm3	,%%xmm3		\n\t"/* Broadcast low 32 bits of xmm3 to all 4 slots of xmm3 */\
		"psubd	%%xmm3	,%%xmm1			\n\t"/* bjmod[0:3] - sinwtm1 */\
		"movmskps	%%xmm1	,%%rdx		\n\t"/* Extract sign bits into 4-bit signmask */\
		"shlq	$8	,%%rdx				\n\t"/* <n3|n2|n1|n0>; Packed indices into base,base_inv tables; move into leftmost byte[1] of edx... */\
		"addq	%%rdx	,%%rsi			\n\t"/* ....and fold into esi. */\
		"movq	%[__data]	,%%rax		\n\t"\
		"movaps	 0x30(%%rax)	,%%xmm1	\n\t	movaps	 0x70(%%rax)	,%%xmm5	\n\t"/* I1~ */\
		/* Don't explicitly load address of sse2_rnd, since we know it's in [half_arr - 0x10]. */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm6	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"/* wtC[j-1]; ecx FREE */\
	/*	movaps		     (%%rcx),%%xmm3	\n\t	movaps		-0x10(%%rcx),%%xmm7	\n\t	*/\
	/*	shufpd		$1 ,%%xmm3,%%xmm3	\n\t	shufpd		$1 ,%%xmm7,%%xmm7	\n\t	*/\
		"\n\t"\
		"addq	$0x20	,%%rbx			\n\t"/* add0 += 4 */\
		"subq	$0x20	,%%rcx			\n\t"/* add1 -= 4 */\
		"movq	%%rbx	,%[__wtA]		\n\t"\
		"movq	%%rcx	,%[__wtB]		\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtlp1,wtnm1 address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd	 0x1a0(%%rax)	,%%xmm2	\n\t	mulpd	 0x1a0(%%rax)	,%%xmm6	\n\t"\
		"mulpd	 0x1b0(%%rax)	,%%xmm3	\n\t	mulpd	 0x1b0(%%rax)	,%%xmm7	\n\t"\
		"mulpd	      (%%rdi)	,%%xmm2	\n\t	mulpd	      (%%rbx)	,%%xmm6	\n\t"\
		"mulpd	 0x040(%%rdx)	,%%xmm3	\n\t	mulpd	 0x040(%%rcx)	,%%xmm7	\n\t"\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		/* sign_mask still in xmm8: */\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* i0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps	%%xmm3	,%%xmm1			\n\t	movaps	%%xmm7	,%%xmm5			\n\t"/* cpy temp */\
		"mulpd	 0xc0(%%rdi)	,%%xmm3	\n\t	mulpd	 0xc0(%%rbx)	,%%xmm7	\n\t"/* temp*baseinv[i1] */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps	%%xmm3	,(%%rcx)		\n\t	movaps	%%xmm7	,(%%rdx)		\n\t"/* store cy_out */\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"/* cy*base[i1] */\
		"subpd	%%xmm3	,%%xmm1			\n\t	subpd	%%xmm7	,%%xmm5			\n\t"/* y = (temp-cy*base[i1]) */\
		"mulpd	%%xmm2	,%%xmm1			\n\t	mulpd	%%xmm6	,%%xmm5			\n\t"/* y*= wt */\
		"movaps	%%xmm1	, 0x30(%%rax)	\n\t	movaps	%%xmm5	, 0x70(%%rax)	\n\t"/* store y */\
		/* Get ready for next set [RE1~, IM1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd	(%%rax)	,%%xmm0			\n\t"/* bjmod[0:3] += bw  */\
		"pand	(%%rbx)	,%%xmm0			\n\t"/* bjmod[0:3] &= nm1 */\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"movaps	%%xmm0,(%%rcx)			\n\t"/* Write bjmod[0:3] */\
	/**********************************************/\
	/*              Repack the data:              */\
	/**********************************************/\
		"movq	%[__data],%%rax			\n\t"\
		"movaps		0x10(%%rax)	,%%xmm1	\n\t	movaps		0x50(%%rax)	,%%xmm5	\n\t"/* reload a[jp+p0 ] */\
		"movaps		    (%%rax)	,%%xmm0	\n\t	movaps		0x40(%%rax)	,%%xmm4	\n\t"/* reload a[jt+p0 ] */\
		"movaps		%%xmm1		,%%xmm3	\n\t	movaps		%%xmm5		,%%xmm7	\n\t"/* cpy a[jp    ] */\
		"movaps		%%xmm0		,%%xmm2	\n\t	movaps		%%xmm4		,%%xmm6	\n\t"/* cpy a[jt    ] */\
		"unpckhpd	0x30(%%rax)	,%%xmm3	\n\t	unpckhpd	0x70(%%rax)	,%%xmm7	\n\t"\
		"unpcklpd	0x30(%%rax)	,%%xmm1	\n\t	unpcklpd	0x70(%%rax)	,%%xmm5	\n\t"\
		"movaps		%%xmm3,0x30(%%rax)	\n\t	movaps		%%xmm7,0x70(%%rax)	\n\t"/* Store hi imag in aj2 */\
		"unpckhpd	0x20(%%rax)	,%%xmm2	\n\t	unpckhpd	0x60(%%rax)	,%%xmm6	\n\t"\
		"unpcklpd	0x20(%%rax)	,%%xmm0	\n\t	unpcklpd	0x60(%%rax)	,%%xmm4	\n\t"\
		"movaps		%%xmm2,0x20(%%rax)	\n\t	movaps		%%xmm6,0x60(%%rax)	\n\t"/* Store hi real in aj2 */\
		"movaps		%%xmm1,0x10(%%rax)	\n\t	movaps		%%xmm5,0x50(%%rax)	\n\t"/* a[jp+p0 ] */\
		"movaps		%%xmm0,    (%%rax)	\n\t	movaps		%%xmm4,0x40(%%rax)	\n\t"/* a[jt+p0 ] */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 2 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm10"	/* Clobbered registers */\
	);\
	}

	/********* Fused fast-LOACC-algo [cf. carry.h::cmplx_carry_fast_pow2_errcheck] version of above 2 macros: ***********/

	// SSE2 macro to do the 2 x 4 cmplx_carry_fast_pow2_wtsinit() scalar-double macro init calls in 4-way parallel mode.
	// This is essentially the weights-computation portion of the SSE2_cmplx_carry_norm_pow2_errcheck[1,2]_2B macros,
	// with the computed weights and their inverses overwriting the input wtl,n data in local memory; the latter data's
	// addresses are fiddled w.r.to their value in the aforementioned carry-macros in order to match those of the outputs
	// of the scalar-double macro sequence.
	#define SSE2_cmplx_carry_fast_pow2_wtsinit(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xn_minus_sil2,Xn_minus_silp2,Xsinwt2,Xsinwtm2, Xsse_bw,Xsse_nm1)\
	{\
	__asm__ volatile (\
	/**********************************************/\
	/*  (j  ),  Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
		"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movaps		%%xmm0	,%%xmm1		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil],%%rsi	\n\t	movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"/* wtB[j-1]; ecx FREE */\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* [NOTE: movhpd/movlpd preferable to movupd/shufpd] */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"mulpd		0x1a0(%%rax),%%xmm3	\n\t	mulpd		0x1a0(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x180(%%rax)	\n\t	movaps		%%xmm4,0x190(%%rax)	\n\t"\
		"movaps		%%xmm3,0x1a0(%%rax)	\n\t	movaps		%%xmm5,0x1b0(%%rax)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*  (j  ),  Imaginary parts                   */\
	/**********************************************/\
		"movaps	%%xmm0	,%%xmm1			\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_silp1],%%rsi\n\t	movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"mulpd		0x1c0(%%rax),%%xmm2	\n\t	mulpd		0x1c0(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"mulpd		0x1e0(%%rax),%%xmm3	\n\t	mulpd		0x1e0(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into odd-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x1c0(%%rax)	\n\t	movaps		%%xmm4,0x1d0(%%rax)	\n\t"\
		"movaps		%%xmm3,0x1e0(%%rax)	\n\t	movaps		%%xmm5,0x1f0(%%rax)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd	(%%rax)	,%%xmm0			\n\t"/* bjmod[0:3] += bw  */\
		"pand	(%%rbx)	,%%xmm0			\n\t"/* bjmod[0:3] &= nm1 */\
		"\n\t"\
	/**********************************************/\
	/*  (j+2),  Real      parts                   */\
	/**********************************************/\
		"movaps		%%xmm0	,%%xmm1		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil2],%%rsi	\n\t	movslq	%[__sinwt2]	,%%rdx		\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		/* These multiplier addresses all += 8 complex (16-byte) slots w.r.to the j-data ones: */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"mulpd		0x200(%%rax),%%xmm2	\n\t	mulpd		0x200(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"mulpd		0x220(%%rax),%%xmm3	\n\t	mulpd		0x220(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x200(%%rax)	\n\t	movaps		%%xmm4,0x210(%%rax)	\n\t"\
		"movaps		%%xmm3,0x220(%%rax)	\n\t	movaps		%%xmm5,0x230(%%rax)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"movq	%[__sse_nm1]	,%%rbx	\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*  (j+2),  Imaginary parts                   */\
	/**********************************************/\
		"movaps	%%xmm0	,%%xmm1			\n\t"\
		"movslq	%[__n_minus_silp2],%%rsi\n\t	movslq	%[__sinwtm2]	,%%rdx	\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"mulpd		0x240(%%rax),%%xmm2	\n\t	mulpd		0x240(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"mulpd		0x260(%%rax),%%xmm3	\n\t	mulpd		0x260(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into odd-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x240(%%rax)	\n\t	movaps		%%xmm4,0x250(%%rax)	\n\t"\
		"movaps		%%xmm3,0x260(%%rax)	\n\t	movaps		%%xmm5,0x270(%%rax)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__n_minus_sil2]	"m" (Xn_minus_sil2)	\
		, [__n_minus_silp2] "m" (Xn_minus_silp2)\
		, [__sinwt2]	"m" (Xsinwt2)		\
		, [__sinwtm2]	"m" (Xsinwtm2)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5"	/* Clobbered registers */\
	);\
	}

	// "Fused" means that - like the HIACC macros - we process 4 carry chains, one from each separate array section corr.
	// to each wide-strided final-iFFT-pass output, at a time, but fuse the [j,j+2] linear-index-within-each-array-section
	// processing (done separately in the HIACC case by the 1_2B and 2_2B SSE2 carry macros) into a single macro. This
	// fusion is eased by the fact that the LOACC chained-weights-computation needs no weights-reinit-from-scalar-data
	// step for the [j+2] data.
	//
	#define SSE2_cmplx_carry_fast_pow2_errcheck(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
			"movq	%[__prp_mult]	,%%rax	\n\t"\
			"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
			"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
		"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"prefetcht0	(%%r14)		\n\t"\
		/***************Unpack the data:*************************/\
			"movq	%[__data]	,%%rax	\n\t"\
			"movaps		    (%%rax)	,%%xmm1	\n\t	movaps		0x40(%%rax)	,%%xmm5	\n\t"/* r1, this is the active  xmm register */\
			"movaps		    %%xmm1	,%%xmm2	\n\t	movaps			%%xmm5	,%%xmm6	\n\t"/* r1, this is the scratch xmm register */\
			"movaps		0x20(%%rax)	,%%xmm0	\n\t	movaps		0x60(%%rax)	,%%xmm4	\n\t"\
			"unpcklpd		%%xmm0	,%%xmm1	\n\t	unpcklpd		%%xmm4	,%%xmm5	\n\t"/* r1 -x- r3 (lo halves) ==> R0~ */\
			"unpckhpd		%%xmm0	,%%xmm2	\n\t	unpckhpd		%%xmm4	,%%xmm6	\n\t"/* r1 -x- r3 (hi halves) ==> R1~ */\
			"movaps		%%xmm2, 0x20(%%rax)	\n\t	movaps		%%xmm6, 0x60(%%rax)	\n\t"/* Tmp store R1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
			"\n\t"\
			"movaps		0x10(%%rax)	,%%xmm2	\n\t	movaps		0x50(%%rax)	,%%xmm6	\n\t"\
			"movaps			%%xmm2	,%%xmm3	\n\t	movaps			%%xmm6	,%%xmm7	\n\t"\
			"movaps		0x30(%%rax)	,%%xmm0	\n\t	movaps		0x70(%%rax)	,%%xmm4	\n\t"\
			"unpcklpd		%%xmm0	,%%xmm2	\n\t	unpcklpd		%%xmm4	,%%xmm6	\n\t"/* r2 -x- r4 (lo halves) ==> I0~ */\
			"unpckhpd		%%xmm0	,%%xmm3	\n\t	unpckhpd		%%xmm4	,%%xmm7	\n\t"/* r2 -x- r4 (hi halves) ==> I1~ */\
			"movaps		%%xmm2, 0x10(%%rax)	\n\t	movaps		%%xmm6, 0x50(%%rax)	\n\t"/* Tmp store I0~ until needed by imaginary-part-processing section */\
			"movaps		%%xmm3, 0x30(%%rax)	\n\t	movaps		%%xmm7, 0x70(%%rax)	\n\t"/* Tmp store I1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
		/* Active data in xmm1,5 here - avoid using those registers in index computation. */\
		/**********************************************/\
		/*          Real      parts                   */\
		/**********************************************/\
			"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
			"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3] */\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movslq	%[__i]	,%%rdi			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
			"xorq	%%rdi	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x180(%%rax),%%xmm2		\n\t	movaps	 0x190(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
			"movaps	-0x10(%%rax),%%xmm4		\n\t"/* sse2_rnd */\
			"movaps	-0x20(%%rax),%%xmm9		\n\t"/* maxerr */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x1a0(%%rax),%%xmm1		\n\t	mulpd	0x1b0(%%rax),%%xmm5		\n\t"/* x *= wi_re */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
			"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt_re */\
			"movaps		%%xmm1	,    (%%rax)\n\t	movaps		%%xmm5	,0x40(%%rax)\n\t"/* store x */\
		/* Update and store weights: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x1a0(%%rax),%%xmm3		\n\t	movaps	 0x1b0(%%rax),%%xmm7	\n\t"/* wi_re, inverse-wt_re */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_re *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_re *= inv_mult[i] */\
		"movaps	%%xmm2,0x180(%%rax)		\n\t	movaps	 %%xmm6,0x190(%%rax)	\n\t"/* Store wt_re */\
		"movaps	%%xmm3,0x1a0(%%rax)		\n\t	movaps	 %%xmm7,0x1b0(%%rax)	\n\t"/* Store wi_re */\
			/* Get ready for next set [IM0~] : */\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"movq	%[__sse_nm1],%%rbx		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
		/**********************************************/\
		/*          Imaginary parts                   */\
		/**********************************************/\
		"movslq		%[__p1],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x10(%%rax)	,%%xmm1	\n\t	movaps	 0x50(%%rax)	,%%xmm5	\n\t"/* I0~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x1c0(%%rax),%%xmm2		\n\t	movaps	 0x1d0(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x1e0(%%rax),%%xmm1		\n\t	mulpd	0x1f0(%%rax),%%xmm5		\n\t"/* y *= wi_im */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = y */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(y) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* y - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(y-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* y = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* y*= wt_im */\
			"movaps		%%xmm1	,0x10(%%rax)\n\t	movaps		%%xmm5	,0x50(%%rax)\n\t"/* store y */\
		/* Update and store weights: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x1e0(%%rax),%%xmm3		\n\t	movaps	 0x1f0(%%rax),%%xmm7	\n\t"/* wi_im, inverse-wt_im */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_im *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_im *= inv_mult[i] */\
		"movaps	%%xmm2,0x1c0(%%rax)		\n\t	movaps	 %%xmm6,0x1d0(%%rax)	\n\t"/* Store wt_im */\
		"movaps	%%xmm3,0x1e0(%%rax)		\n\t	movaps	 %%xmm7,0x1f0(%%rax)	\n\t"/* Store wi_im */\
			/* Get ready for next set [RE1~] : */\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"movq	%[__sse_nm1],%%rbx		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
	/**********************************************/\
	/*          Now do the (j+2) data:            */\
	/**********************************************/\
		"movslq		%[__p2],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		/**********************************************/\
		/*          Real      parts                   */\
		/**********************************************/\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x20(%%rax)	,%%xmm1	\n\t	movaps	 0x60(%%rax)	,%%xmm5	\n\t"/* R1~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x200(%%rax),%%xmm2		\n\t	movaps	 0x210(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x220(%%rax),%%xmm1		\n\t	mulpd	0x230(%%rax),%%xmm5		\n\t"/* x *= wi_re */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt_re */\
			"movaps		%%xmm1	,0x20(%%rax)\n\t	movaps		%%xmm5	,0x60(%%rax)\n\t"/* store x */\
		/* Update and store weights ... the (j+2) wts are in the 8 slots (half_arr+[0x200-0x270]) above the (j) ones: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x220(%%rax),%%xmm3		\n\t	movaps	 0x230(%%rax),%%xmm7	\n\t"/* wi_re, inverse-wt_re */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_re *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_re *= inv_mult[i] */\
		"movaps	%%xmm2,0x200(%%rax)		\n\t	movaps	 %%xmm6,0x210(%%rax)	\n\t"/* Store wt_re */\
		"movaps	%%xmm3,0x220(%%rax)		\n\t	movaps	 %%xmm7,0x230(%%rax)	\n\t"/* Store wi_re */\
			/* Get ready for next set [IM0~] : */\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"movq	%[__sse_nm1],%%rbx		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
		/**********************************************/\
		/*          Imaginary parts                   */\
		/**********************************************/\
		"movslq		%[__p3],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x30(%%rax)	,%%xmm1	\n\t	movaps	 0x70(%%rax)	,%%xmm5	\n\t"/* I1~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x240(%%rax),%%xmm2		\n\t	movaps	 0x250(%%rax),%%xmm6	\n\t"/* wt_im for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x260(%%rax),%%xmm1		\n\t	mulpd	0x270(%%rax),%%xmm5		\n\t"/* y *= wi_im */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = y */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(y) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* y - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(y-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movaps		%%xmm9,-0x20(%%rax)	\n\t"/* Store maxerr */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* y = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* y*= wt_im */\
			"movaps		%%xmm1	,0x30(%%rax)\n\t	movaps		%%xmm5	,0x70(%%rax)\n\t"/* store y */\
		/* Update and store weights ... the (j+2) wts are in the 8 slots (half_arr+[0x200-0x270]) above the (j) ones: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x260(%%rax),%%xmm3		\n\t	movaps	 0x270(%%rax),%%xmm7	\n\t"/* wi_im, inverse-wt_im */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_im *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_im *= inv_mult[i] */\
		"movaps	%%xmm2,0x240(%%rax)		\n\t	movaps	 %%xmm6,0x250(%%rax)	\n\t"/* Store wt_im */\
		"movaps	%%xmm3,0x260(%%rax)		\n\t	movaps	 %%xmm7,0x270(%%rax)	\n\t"/* Store wi_im */\
			/* Get ready for next set [RE1~] : */\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"movq	%[__sse_nm1],%%rbx		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"pand		(%%rbx)	,%%xmm0		\n\t"/* bjmod[0:3] &= nm1 */\
			"movq	%[__bjmod_0],%%rdi		\n\t"\
			"movaps	%%xmm0,(%%rdi)			\n\t"/* Write bjmod[0:3] */\
		/**********************************************/\
		/*              Repack the data:              */\
		/**********************************************/\
			"movq	%[__data],%%rax			\n\t"\
			"movaps		0x10(%%rax)	,%%xmm1	\n\t	movaps		0x50(%%rax)	,%%xmm5	\n\t"/* reload a[jp+p0 ] */\
			"movaps		0x30(%%rax)	,%%xmm0	\n\t	movaps		0x70(%%rax)	,%%xmm4	\n\t"\
			"movaps		%%xmm1		,%%xmm3	\n\t	movaps		%%xmm5		,%%xmm7	\n\t"/* cpy a[jp    ] */\
			"unpcklpd		%%xmm0	,%%xmm1	\n\t	unpcklpd		%%xmm4	,%%xmm5	\n\t"\
			"unpckhpd		%%xmm0	,%%xmm3	\n\t	unpckhpd		%%xmm4	,%%xmm7	\n\t"\
			"movaps		%%xmm1,0x10(%%rax)	\n\t	movaps		%%xmm5,0x50(%%rax)	\n\t"/* store a[jp+p0 ] */\
			"movaps		%%xmm3,0x30(%%rax)	\n\t	movaps		%%xmm7,0x70(%%rax)	\n\t"/* Store hi imag in aj2 */\
			"movaps		    (%%rax)	,%%xmm0	\n\t	movaps		0x40(%%rax)	,%%xmm4	\n\t"/* reload a[jt+p0 ] */\
			"movaps		0x20(%%rax)	,%%xmm1	\n\t	movaps		0x60(%%rax)	,%%xmm5	\n\t"\
			"movaps		%%xmm0		,%%xmm2	\n\t	movaps		%%xmm4		,%%xmm6	\n\t"/* cpy a[jt    ] */\
			"unpckhpd		%%xmm1	,%%xmm2	\n\t	unpckhpd		%%xmm5	,%%xmm6	\n\t"\
			"unpcklpd		%%xmm1	,%%xmm0	\n\t	unpcklpd		%%xmm5	,%%xmm4	\n\t"\
			"movaps		%%xmm2,0x20(%%rax)	\n\t	movaps		%%xmm6,0x60(%%rax)	\n\t"/* Store hi real in aj2 */\
			"movaps		%%xmm0,    (%%rax)	\n\t	movaps		%%xmm4,0x40(%%rax)	\n\t"/* store a[jt+p0 ] */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__i]			"m" (Xi)			\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10"	/* Clobbered registers */\
	);\
	}

	// 2nd version of above, which includes the loop-control and pointer-updates - tried this using radix-32 carry
	// (64-bit SSE2 only), proved slower than C-loop version, so just stash the macro here for possible future use:
	#define SSE2_cmplx_carry_fast_pow2_loop(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_nm1,Xsse_sw, Xadd0,Xp1,Xp2,Xp3,Xpoff, Xloop)\
	{\
	/* Loop-wrapped skeleton of SSE2_cmplx_carry_fast_pow2_errcheck: the while(loop--) control */\
	/* and per-iteration pointer/index updates are done in asm. FIXES vs. orig version:         */\
	/* (1) data/cyA/cyB/bjmod_0/i are written by the asm, so they must be declared as "+m"      */\
	/*     read-write outputs - writing to input-only "m" operands is undefined behavior and    */\
	/*     lets the compiler assume the C-side copies are unmodified;                           */\
	/* (2) the poff[]-element fetch used leaq, which only computes the element's address - the  */\
	/*     prefetch at loop top scales r15 by 8 as an array *index*, so the int element must    */\
	/*     actually be loaded (movslq). movslq leaves flags intact, so jnz still sees decq.     */\
	__asm__ volatile (\
		"xorq	%%r15,%%r15		\n\t"/* Init r15 = 0; thus pfetch-offsets will be poff = 0,radix-4,radix-8, ... ,8,4.  */\
		"movslq	%[__loop], %%rcx	\n\t"/* ASM loop structured as while(loop != 0; --loop){...} */\
	"1:	\n\t"/* loop-start label */\
		"movq	%[__add0],%%r14			\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"leaq	(%%r14,%%r15,8),%%r14	\n\t"/* Elts of poff[] are indices into main double-array, so multiply by 8 to get corr. byte-offset */\
		"prefetcht0	(%%r14)		\n\t"\
	/* [copy guts of SSE2_cmplx_carry_fast_pow2_errcheck to here] */\
		/* Set i = 0 and increment the loop-variable pointers - first 3 are vec_dbl* [16 bytes], bjmodn is int* [4 bytes]: */\
			"movl	$0, %[__i]	\n\t"/* i is 32-bit, thus use movl, not movq */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movq	%[__cyA]	,%%rbx		\n\t"\
			"movq	%[__cyB]	,%%rdx		\n\t"\
			"movq	%[__bjmod_0],%%rsi		\n\t"\
			"addq	$0x80,%%rax				\n\t"/* data += 8 */\
			"addq	$0x20,%%rbx				\n\t"/* cyA  += 2 */\
			"addq	$0x20,%%rdx				\n\t"/* cyB  += 2 */\
			"addq	$0x10,%%rsi				\n\t"/* bjmodn += 4 */\
			"movq	%%rax,%[__data]			\n\t"\
			"movq	%%rbx,%[__cyA]			\n\t"\
			"movq	%%rdx,%[__cyB]			\n\t"\
			"movq	%%rsi,%[__bjmod_0]		\n\t"\
		"decq	%%rcx \n\t"\
		"movq	%[__poff],%%r15			\n\t"/* poff[] = p0,4,8,... */\
		"movslq	(%%r15,%%rcx,4),%%r15	\n\t"/* Load (sign-extend) int elt poff[rcx] ... our loop index (rcx) doubles as idx into poff[]. Does not touch flags. */\
	"jnz 1b 	\n\t"/* loop[1] end; continue is via jump-back if rcx != 0 (tests the decq result above) */\
		: [__data]		"+m" (Xdata)	/* In/outputs: memory operands both read and updated in-place by the asm */\
		, [__cyA]		"+m" (XcyA)		\
		, [__cyB]		"+m" (XcyB)		\
		, [__bjmod_0]	"+m" (Xbjmod_0)	\
		, [__i]			"+m" (Xi)		\
		: [__half_arr]	"m" (Xhalf_arr)	/* Remaining (read-only) inputs from memory addresses here */\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_nm1]	"m" (Xsse_nm1)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		,	[__poff] "m" (Xpoff)\
		/* Loop index: */\
		,	[__loop] "m" (Xloop)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9"/* Clobbered registers */\
	);\
	}

#endif	// #ifdef USE_AVX

#if !defined(USE_ARM_V8_SIMD)	// 64-bit SSE2:

	/***************************************************************************************************************************************************/
	/********* Non-power-of-2-FFT versions of SSE2_cmplx_carry_norm_pow2_errcheck0_2B,1_2B,2_2B (only give sans-error-check version of latter 2: *******/
	/***************************************************************************************************************************************************/

	#define SSE2_cmplx_carry_norm_errcheck1_2B(Xdata,XwtA,XwtB,XwtC,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
		"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"prefetcht0	(%%r14)		\n\t"\
	/***************Unpack the data:*************************/\
		"movq	%[__data]	,%%rax	\n\t"\
		"movaps		    (%%rax)	,%%xmm1	\n\t	movaps		0x40(%%rax)	,%%xmm5	\n\t"\
		"movaps		    (%%rax)	,%%xmm2	\n\t	movaps		0x40(%%rax)	,%%xmm6	\n\t"\
		"unpcklpd	0x20(%%rax)	,%%xmm1	\n\t	unpcklpd	0x60(%%rax)	,%%xmm5	\n\t"\
		"unpckhpd	0x20(%%rax)	,%%xmm2	\n\t	unpckhpd	0x60(%%rax)	,%%xmm6	\n\t"\
		"movaps		%%xmm2, 0x20(%%rax)	\n\t	movaps		%%xmm6, 0x60(%%rax)	\n\t"\
		"\n\t"\
		"movaps		0x10(%%rax)	,%%xmm2	\n\t	movaps		0x50(%%rax)	,%%xmm6	\n\t"\
		"movaps		0x10(%%rax)	,%%xmm3	\n\t	movaps		0x50(%%rax)	,%%xmm7	\n\t"\
		"unpcklpd	0x30(%%rax)	,%%xmm2	\n\t	unpcklpd	0x70(%%rax)	,%%xmm6	\n\t"\
		"unpckhpd	0x30(%%rax)	,%%xmm3	\n\t	unpckhpd	0x70(%%rax)	,%%xmm7	\n\t"\
		"movaps		%%xmm2, 0x10(%%rax)	\n\t	movaps		%%xmm6, 0x50(%%rax)	\n\t"\
		"movaps		%%xmm3, 0x30(%%rax)	\n\t	movaps		%%xmm7, 0x70(%%rax)	\n\t"\
	/**********************************************/\
	/*          Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"\
		"movaps		(%%rax)	,	%%xmm0	\n\t"\
		"movq	%[__sse_sw]	,	%%rbx	\n\t"\
		"movaps		(%%rbx)	,	%%xmm7	\n\t"\
		"psubd		%%xmm0	,	%%xmm7	\n\t"\
		"movmskps	%%xmm7	,	%%rsi	\n\t"\
	"movslq	%[__i]	,%%rcx			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
	"xorq	%%rcx	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
		"shlq	$24		,%%rsi			\n\t"\
		"movaps		%%xmm0	,%%xmm7		\n\t"\
		"movslq	%[__n_minus_sil],%%rcx	\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"\
		"pshufd	$0,	%%xmm2	,%%xmm2		\n\t"\
		"psubd		%%xmm0	,%%xmm2		\n\t"\
		"movmskps	%%xmm2	,%%rcx		\n\t"\
		"shlq	$16		,%%rcx			\n\t"\
		"addq	%%rcx	,%%rsi			\n\t"\
		"movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0,	%%xmm3	,%%xmm3		\n\t"\
		"psubd		%%xmm3	,%%xmm7		\n\t"\
		"movmskps	%%xmm7	,%%rdx		\n\t"\
		"shlq	$8		,%%rdx			\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		-0x10(%%rax),%%xmm4	\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm6	\n\t"\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"\
		"\n\t"\
		"subq	$0x20	,%%rcx			\n\t"\
		"movq	%%rcx	,%[__wtB]		\n\t"\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtl,wtn address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm6	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x190(%%rax),%%xmm3	\n\t	mulpd		0x190(%%rax),%%xmm7	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm6	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm7	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"\
		"mulpd		0xc0(%%rdi)	,%%xmm3	\n\t	mulpd		0xc0(%%rbx)	,%%xmm7	\n\t"\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps		%%xmm3	,(%%rcx)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"\
		"\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"\
		"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"\
		"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"\
		"movaps		%%xmm1	,    (%%rax)\n\t	movaps		%%xmm5	,0x40(%%rax)\n\t"\
		"\n\t"\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"\
	/**********************************************/\
	/*          Imaginary parts                   */\
	/**********************************************/\
	"movslq		%[__p1],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw]	,%%rdx		\n\t"\
		"movaps	(%%rdx)	,%%xmm1			\n\t"\
		"psubd	%%xmm0	,%%xmm1			\n\t"\
		"movmskps	%%xmm1	,%%rsi		\n\t"\
		"shlq	$24	,%%rsi				\n\t"\
		"movaps	%%xmm0	,%%xmm1			\n\t"\
		"movslq	%[__n_minus_silp1],%%rcx\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t"\
		"movmskps	%%xmm2	,%%rcx		\n\t"\
		"shlq	$16	,%%rcx				\n\t"\
		"addq	%%rcx	,%%rsi			\n\t"\
		"movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"movaps	 0x10(%%rax)	,%%xmm1	\n\t"\
		"movaps	 0x50(%%rax)	,%%xmm5	\n\t"\
		"\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"\
		"\n\t"\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm6	\n\t"\
		"movhpd	     (%%rcx)	,%%xmm3	\n\t	movhpd	-0x10(%%rcx)	,%%xmm7	\n\t"\
		"movlpd	 0x08(%%rcx)	,%%xmm3	\n\t	movlpd	-0x08(%%rcx)	,%%xmm7	\n\t"\
		"\n\t"\
		"addq	$0x20	,%%rbx			\n\t"\
		"subq	$0x20	,%%rcx			\n\t"\
		"movq	%%rbx	,%[__wtA]		\n\t"\
		"movq	%%rcx	,%[__wtC]		\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtlp1,wtnm1 address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd	 0x1a0(%%rax)	,%%xmm2	\n\t	mulpd	 0x1a0(%%rax)	,%%xmm6	\n\t"/* wt   =wtA*wtlp1 */\
		"mulpd	 0x1b0(%%rax)	,%%xmm3	\n\t	mulpd	 0x1b0(%%rax)	,%%xmm7	\n\t"/* wtinv=wtC*wtnm1 */\
		"mulpd	      (%%rdi)	,%%xmm2	\n\t	mulpd	      (%%rbx)	,%%xmm6	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd	 0x040(%%rdx)	,%%xmm3	\n\t	mulpd	 0x040(%%rcx)	,%%xmm7	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		/* sign_mask still in xmm8: */\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps	%%xmm3	,%%xmm1			\n\t	movaps	%%xmm7	,%%xmm5			\n\t"\
		"mulpd	 0xc0(%%rdi)	,%%xmm3	\n\t	mulpd	 0xc0(%%rbx)	,%%xmm7	\n\t"\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps	%%xmm3	,(%%rcx)		\n\t	movaps	%%xmm7	,(%%rdx)		\n\t"\
		"\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"\
		"subpd	%%xmm3	,%%xmm1			\n\t	subpd	%%xmm7	,%%xmm5			\n\t"\
		"mulpd	%%xmm2	,%%xmm1			\n\t	mulpd	%%xmm6	,%%xmm5			\n\t"\
		"movaps	%%xmm1	, 0x10(%%rax)	\n\t	movaps	%%xmm5	, 0x50(%%rax)	\n\t"\
		"\n\t"\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"movaps	%%xmm0,(%%rcx)			\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__i]			"m" (Xi)			\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 1 index offset */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm10"	/* Clobbered registers */\
	);\
	}

	#define SSE2_cmplx_carry_norm_errcheck2_2B(Xdata,XwtA,XwtB,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xn_minus_silp1,Xn_minus_sil,Xsign_mask,Xsinwt,Xsinwtm1,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
		"movq	%[__prp_mult]	,%%rax	\n\t"\
		"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
		"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
	"movq	%[__add0],%%r14	\n\t"/* base address for 2 prefetches-from-main-data-array spread through this macro */\
	"movslq		%[__p2],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
	/**********************************************/\
	/*          Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"\
		"movaps		(%%rax)	,	%%xmm0	\n\t"\
		"movq	%[__sse_sw]	,	%%rbx	\n\t"\
		"movaps		(%%rbx)	,	%%xmm1	\n\t"\
		"psubd		%%xmm0	,	%%xmm1	\n\t"\
		"movmskps	%%xmm1	,	%%rsi	\n\t"\
		"\n\t"\
		"shlq	$24		,%%rsi			\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"movslq	%[__n_minus_sil],%%rcx	\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"\
		"pshufd	$0,	%%xmm2	,%%xmm2		\n\t"\
		"psubd		%%xmm0	,%%xmm2		\n\t"\
		"movmskps	%%xmm2	,%%rcx		\n\t"\
		"shlq	$16		,%%rcx			\n\t"\
		"addq	%%rcx	,%%rsi			\n\t"\
		"movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0,	%%xmm3	,%%xmm3		\n\t"\
		"psubd		%%xmm3	,%%xmm1		\n\t"\
		"movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$8		,%%rdx			\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"\n\t"\
		"movq	%[__data],%%rax			\n\t"\
		"movaps	 0x20(%%rax),%%xmm1		\n\t	movaps		 0x60(%%rax),%%xmm5\n\t"\
		"\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		-0x10(%%rax),%%xmm4	\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm6	\n\t"\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm7	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm7	\n\t"\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtlp1,wtnm1 address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm6	\n\t"\
		"mulpd		0x190(%%rax),%%xmm3	\n\t	mulpd		0x190(%%rax),%%xmm7	\n\t"\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm6	\n\t"\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm7	\n\t"\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"\
		"mulpd		0xc0(%%rdi)	,%%xmm3	\n\t	mulpd		0xc0(%%rbx)	,%%xmm7	\n\t"\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps		%%xmm3	,(%%rcx)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"\
		"\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"\
		"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"\
		"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"\
		"movaps		%%xmm1	,0x20(%%rax)\n\t	movaps		%%xmm5	,0x60(%%rax)\n\t"\
		"\n\t"\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"\
	/**********************************************/\
	/*          Imaginary parts                   */\
	/**********************************************/\
	"movslq		%[__p3],%%r15	\n\t"\
	"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		"movq	%[__sse_sw]	,%%rdx		\n\t"\
		"movaps	(%%rdx)	,%%xmm1			\n\t"\
		"psubd	%%xmm0	,%%xmm1			\n\t"\
		"movmskps	%%xmm1	,%%rsi		\n\t"\
		"shlq	$24	,%%rsi				\n\t"\
		"movaps	%%xmm0	,%%xmm1			\n\t"\
		"movslq	%[__n_minus_silp1],%%rcx\n\t"\
		"movd	%%rcx	,%%xmm2			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t"\
		"movmskps	%%xmm2	,%%rcx		\n\t"\
		"shlq	$16	,%%rcx				\n\t"\
		"addq	%%rcx	,%%rsi			\n\t"\
		"movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"movaps	 0x30(%%rax)	,%%xmm1	\n\t"\
		"movaps	 0x70(%%rax)	,%%xmm5	\n\t"\
		"\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"\n\t"\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm6	\n\t"\
		"movhpd	     (%%rcx)	,%%xmm3	\n\t	movhpd	-0x10(%%rcx)	,%%xmm7	\n\t"\
		"movlpd	 0x08(%%rcx)	,%%xmm3	\n\t	movlpd	-0x08(%%rcx)	,%%xmm7	\n\t"\
		"\n\t"\
		"addq	$0x20	,%%rbx			\n\t"\
		"subq	$0x20	,%%rcx			\n\t"\
		"movq	%%rbx	,%[__wtA]		\n\t"\
		"movq	%%rcx	,%[__wtB]		\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		/* v20: wtlp1,wtnm1 address offsets += 0x80 due to harmonization of HIACC & LOACC data layouts: */\
		"mulpd	 0x1a0(%%rax)	,%%xmm2	\n\t	mulpd	 0x1a0(%%rax)	,%%xmm6	\n\t"\
		"mulpd	 0x1b0(%%rax)	,%%xmm3	\n\t	mulpd	 0x1b0(%%rax)	,%%xmm7	\n\t"\
		"mulpd	      (%%rdi)	,%%xmm2	\n\t	mulpd	      (%%rbx)	,%%xmm6	\n\t"\
		"mulpd	 0x040(%%rdx)	,%%xmm3	\n\t	mulpd	 0x040(%%rcx)	,%%xmm7	\n\t"\
		"\n\t"\
		"movq	%[__cyA]	,%%rcx		\n\t	movq	%[__cyB]	,%%rdx		\n\t"\
		"mulpd		%%xmm3	,%%xmm1		\n\t	mulpd		%%xmm7	,%%xmm5		\n\t"/* x *= wi_re */\
		"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
		/* sign_mask still in xmm8: */\
		"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
		"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
		"maxpd			%%xmm5	,%%xmm1	\n\t"\
		"maxpd		-0x20(%%rax),%%xmm1	\n\t"\
		"movaps		%%xmm1,-0x20(%%rax)	\n\t"/* if(frac > maxerr) maxerr=frac */\
		"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
		"addpd		(%%rcx),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
		"\n\t"\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$20,	%%rdi			\n\t	shrq	$22,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movaps	%%xmm3	,%%xmm1			\n\t	movaps	%%xmm7	,%%xmm5			\n\t"\
		"mulpd	 0xc0(%%rdi)	,%%xmm3	\n\t	mulpd	 0xc0(%%rbx)	,%%xmm7	\n\t"\
		"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
		"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"\
		/*roundpd	$0,%%xmm3,%%xmm3	\n\t	roundpd		$0,%%xmm7,%%xmm7	*/\
		"movaps	%%xmm3	,(%%rcx)		\n\t	movaps	%%xmm7	,(%%rdx)		\n\t"\
		"\n\t"\
		"movq	%[__data]	,%%rax		\n\t"\
		"mulpd	 0x80(%%rdi)	,%%xmm3	\n\t	mulpd	 0x80(%%rbx)	,%%xmm7	\n\t"\
		"subpd	%%xmm3	,%%xmm1			\n\t	subpd	%%xmm7	,%%xmm5			\n\t"\
		"mulpd	%%xmm2	,%%xmm1			\n\t	mulpd	%%xmm6	,%%xmm5			\n\t"\
		"movaps	%%xmm1	, 0x30(%%rax)	\n\t	movaps	%%xmm5	, 0x70(%%rax)	\n\t"\
		"\n\t"\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"\
		"movq	%[__bjmod_0],%%rcx		\n\t"\
		"movaps	%%xmm0,(%%rcx)			\n\t"\
	/**********************************************/\
	/*              Repack the data:              */\
	/**********************************************/\
		"movq	%[__data],%%rax			\n\t"\
		"movaps		0x10(%%rax)	,%%xmm1	\n\t	movaps		0x50(%%rax)	,%%xmm5	\n\t"\
		"movaps		    (%%rax)	,%%xmm0	\n\t	movaps		0x40(%%rax)	,%%xmm4	\n\t"\
		"movaps		%%xmm1		,%%xmm3	\n\t	movaps		%%xmm5		,%%xmm7	\n\t"\
		"movaps		%%xmm0		,%%xmm2	\n\t	movaps		%%xmm4		,%%xmm6	\n\t"\
		"unpckhpd	0x30(%%rax)	,%%xmm3	\n\t	unpckhpd	0x70(%%rax)	,%%xmm7	\n\t"\
		"unpcklpd	0x30(%%rax)	,%%xmm1	\n\t	unpcklpd	0x70(%%rax)	,%%xmm5	\n\t"\
		"movaps		%%xmm3,0x30(%%rax)	\n\t	movaps		%%xmm7,0x70(%%rax)	\n\t"\
		"unpckhpd	0x20(%%rax)	,%%xmm2	\n\t	unpckhpd	0x60(%%rax)	,%%xmm6	\n\t"\
		"unpcklpd	0x20(%%rax)	,%%xmm0	\n\t	unpcklpd	0x60(%%rax)	,%%xmm4	\n\t"\
		"movaps		%%xmm2,0x20(%%rax)	\n\t	movaps		%%xmm6,0x60(%%rax)	\n\t"\
		"movaps		%%xmm1,0x10(%%rax)	\n\t	movaps		%%xmm5,0x50(%%rax)	\n\t"\
		"movaps		%%xmm0,    (%%rax)	\n\t	movaps		%%xmm4,0x40(%%rax)	\n\t"\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__wtA]		"m" (XwtA)		\
		, [__wtB]		"m" (XwtB)		\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 2 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm10"	/* Clobbered registers */\
	);\
	}

	// SSE2 macro to do the 2 x 4 cmplx_carry_fast_pow2_wtsinit() scalar-double macro init calls in 4-way parallel mode.
	// This is essentially the weights-computation portion of the SSE2_cmplx_carry_norm_pow2_errcheck[1,2]_2B macros,
	// with the computed weights and their inverses overwriting the input wtl,n data in local memory; the latter data's
	// addresses are adjusted with respect to their values in the aforementioned carry-macros in order to match those of the outputs
	// of the scalar-double macro sequence.
	#define SSE2_cmplx_carry_fast_wtsinit(XwtA,XwtB,XwtC, Xbjmod_0, Xhalf_arr,Xsign_mask, Xn_minus_sil,Xn_minus_silp1,Xsinwt,Xsinwtm1, Xn_minus_sil2,Xn_minus_silp2,Xsinwt2,Xsinwtm2, Xsse_bw,Xsse_n)\
	{\
	__asm__ volatile (\
	/**********************************************/\
	/*  (j  ),  Real      parts                   */\
	/**********************************************/\
		"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
		"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3]. PERSISTENT COPY OF BJMOD[0:3] REMAINS IN xmm0. */\
		"movaps		%%xmm0	,%%xmm1		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil],%%rsi	\n\t	movslq	%[__sinwt]	,%%rdx		\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtB]	,%%rcx		\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"/* wtB[j-1]; ecx FREE */\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* [NOTE: movhpd/movlpd preferable to movupd/shufpd] */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x180(%%rax),%%xmm2	\n\t	mulpd		0x180(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x1a0(%%rax),%%xmm3	\n\t	mulpd		0x1a0(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x180(%%rax)	\n\t	movaps		%%xmm4,0x190(%%rax)	\n\t"\
		"movaps		%%xmm3,0x1a0(%%rax)	\n\t	movaps		%%xmm5,0x1b0(%%rax)	\n\t"\
		/* Get ready for next set [IM0~] : */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
	/**********************************************/\
	/*  (j  ),  Imaginary parts                   */\
	/**********************************************/\
		"movaps	%%xmm0	,%%xmm1			\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_silp1],%%rsi\n\t	movslq	%[__sinwtm1]	,%%rdx	\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x1c0(%%rax),%%xmm2	\n\t	mulpd		0x1c0(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x1e0(%%rax),%%xmm3	\n\t	mulpd		0x1e0(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into odd-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x1c0(%%rax)	\n\t	movaps		%%xmm4,0x1d0(%%rax)	\n\t"\
		"movaps		%%xmm3,0x1e0(%%rax)	\n\t	movaps		%%xmm5,0x1f0(%%rax)	\n\t"\
		/* Get ready for next set [RE1~] : */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		"\n\t"\
	/**********************************************/\
	/*  (j+2),  Real      parts                   */\
	/**********************************************/\
		"movaps		%%xmm0	,%%xmm1		\n\t"/* bjmod[0:3] COPY */\
		"movslq	%[__n_minus_sil2],%%rsi	\n\t	movslq	%[__sinwt2]	,%%rdx		\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"\
		"movaps		     (%%rbx),%%xmm2	\n\t	movaps		 0x10(%%rbx),%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		/* These multiplier addresses all += 8 complex (16-byte) slots w.r.to the j-data ones: */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x200(%%rax),%%xmm2	\n\t	mulpd		0x200(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x220(%%rax),%%xmm3	\n\t	mulpd		0x220(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into even-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x200(%%rax)	\n\t	movaps		%%xmm4,0x210(%%rax)	\n\t"\
		"movaps		%%xmm3,0x220(%%rax)	\n\t	movaps		%%xmm5,0x230(%%rax)	\n\t"\
		/* Get ready for next set [IM1~] : */\
		"movq	%[__sse_n]	,%%rbx		\n\t"\
		"movaps		(%%rbx)	,%%xmm2		\n\t"\
		"movq	%[__sse_bw]	,%%rax		\n\t"\
		"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
		"movaps		%%xmm0	,%%xmm1		\n\t"\
		"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
		"pand		%%xmm2	,%%xmm1		\n\t"\
		"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
	/**********************************************/\
	/*  (j+2),  Imaginary parts                   */\
	/**********************************************/\
		"movaps	%%xmm0	,%%xmm1			\n\t"\
		"movslq	%[__n_minus_silp2],%%rsi\n\t	movslq	%[__sinwtm2]	,%%rdx	\n\t"\
		"movd	%%rsi	,%%xmm2			\n\t	movd	%%rdx	,%%xmm3			\n\t"\
		"pshufd	$0	,%%xmm2	,%%xmm2		\n\t	pshufd	$0	,%%xmm3	,%%xmm3		\n\t"\
		"psubd	%%xmm0	,%%xmm2			\n\t	psubd	%%xmm3	,%%xmm1			\n\t"\
		"movmskps	%%xmm2	,%%rsi		\n\t	movmskps	%%xmm1	,%%rdx		\n\t"\
		"shlq	$16	,%%rsi				\n\t	shlq	$8	,%%rdx				\n\t"\
		"addq	%%rdx	,%%rsi			\n\t"\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movq	%[__wtA]	,%%rbx		\n\t"\
		"movq	%[__wtC]	,%%rcx		\n\t"/* wtB == wtC for this latter set of carries */\
		"movaps	     (%%rbx)	,%%xmm2	\n\t	movaps	 0x10(%%rbx)	,%%xmm4	\n\t"/* wtA[j  ]; ebx FREE */\
		"movhpd		     (%%rcx),%%xmm3	\n\t	movhpd		-0x10(%%rcx),%%xmm5	\n\t"\
		"movlpd		 0x08(%%rcx),%%xmm3	\n\t	movlpd		-0x08(%%rcx),%%xmm5	\n\t"/* wtC[j-1]; ecx FREE */\
		"movq	%%rsi,	%%rdi			\n\t	movq	%%rsi,	%%rbx			\n\t"\
		"shrq	$12,	%%rdi			\n\t	shrq	$14,	%%rbx			\n\t"\
		"andq	$0x30,	%%rdi			\n\t	andq	$0x30,	%%rbx			\n\t"/* m0 */\
		"addq	%%rax,	%%rdi			\n\t	addq	%%rax,	%%rbx			\n\t"\
		"movq	%%rsi,	%%rdx			\n\t	movq	%%rsi,	%%rcx			\n\t"\
		"shrq	$4 ,	%%rdx			\n\t	shrq	$6 ,	%%rcx			\n\t"\
		"andq	$0x30,	%%rdx			\n\t	andq	$0x30,	%%rcx			\n\t"/* m2 */\
		"addq	%%rax,	%%rdx			\n\t	addq	%%rax,	%%rcx			\n\t"\
		"mulpd		0x240(%%rax),%%xmm2	\n\t	mulpd		0x240(%%rax),%%xmm4	\n\t"/* wt   =wtA*wtl */\
		"mulpd		0x260(%%rax),%%xmm3	\n\t	mulpd		0x260(%%rax),%%xmm5	\n\t"/* wtinv=wtB*wtn */\
		"mulpd		     (%%rdi),%%xmm2	\n\t	mulpd		     (%%rbx),%%xmm4	\n\t"/* wt   =wt   *one_half[m01] */\
		"mulpd		0x040(%%rdx),%%xmm3	\n\t	mulpd		0x040(%%rcx),%%xmm5	\n\t"/* wtinv=wtinv*one_half[4+m23] */\
		/* Results go into odd-index slots, overwriting the wtl,n multipliers in the bottom 2 of same: */\
		"movaps		%%xmm2,0x240(%%rax)	\n\t	movaps		%%xmm4,0x250(%%rax)	\n\t"\
		"movaps		%%xmm3,0x260(%%rax)	\n\t	movaps		%%xmm5,0x270(%%rax)	\n\t"\
		/* No final update/write of modified bjmod[0:3] back to mem here because init macro must leave them unchanged. */\
		:					/* outputs: none */\
		: [__wtA]		"m" (XwtA)	/* All inputs from memory addresses here */\
		, [__wtB]		"m" (XwtB)		\
		, [__wtC]		"m" (XwtC)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__n_minus_sil]	"m" (Xn_minus_sil)	\
		, [__n_minus_silp1] "m" (Xn_minus_silp1)\
		, [__sinwt]		"m" (Xsinwt)		\
		, [__sinwtm1]	"m" (Xsinwtm1)		\
		, [__n_minus_sil2]	"m" (Xn_minus_sil2)	\
		, [__n_minus_silp2] "m" (Xn_minus_silp2)\
		, [__sinwt2]	"m" (Xsinwt2)		\
		, [__sinwtm2]	"m" (Xsinwtm2)		\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5"	/* Clobbered registers */\
	);\
	}

	/********* Fused fast-LOACC-algo [cf. carry.h::cmplx_carry_fast_pow2_errcheck] version of above 2 macros: ***********/
	// "Fused" means that - like the HIACC macros - we process 4 carry chains, one from each separate array section corr.
	// to each wide-strided final-iFFT-pass output, at a time, but fuse the [j,j+2] linear-index-within-each-array-section
	// processing (done separately in the HIACC case by the 1_2B and 2_2B SSE2 carry macros) into a single macro. This
	// fusion is eased by the fact that the LOACC chained-weights-computation needs no weights-reinit-from-scalar-data
	// step for the [j+2] data.
	//
	#define SSE2_cmplx_carry_fast_errcheck(Xdata,XcyA,XcyB,Xbjmod_0,Xhalf_arr,Xi,Xsign_mask,Xsse_bw,Xsse_n,Xsse_sw, Xadd0,Xp1,Xp2,Xp3, Xprp_mult)\
	{\
	__asm__ volatile (\
			"movq	%[__prp_mult]	,%%rax	\n\t"\
			"movsd		(%%rax)	,	%%xmm10	\n\t"/* prp_mult */\
			"shufpd		$0,%%xmm10,%%xmm10	\n\t"/* prp_mult, broadcast to both double-slots of xmm10 */\
		"movq	%[__add0],%%r14	\n\t"/* base address for 4 prefetches-from-main-data-array spread through this macro */\
		"prefetcht0	(%%r14)		\n\t"\
		/***************Unpack the data:*************************/\
			"movq	%[__data]	,%%rax	\n\t"\
			"movaps		    (%%rax)	,%%xmm1	\n\t	movaps		0x40(%%rax)	,%%xmm5	\n\t"/* r1, this is the active  xmm register */\
			"movaps		    %%xmm1	,%%xmm2	\n\t	movaps			%%xmm5	,%%xmm6	\n\t"/* r1, this is the scratch xmm register */\
			"movaps		0x20(%%rax)	,%%xmm0	\n\t	movaps		0x60(%%rax)	,%%xmm4	\n\t"\
			"unpcklpd		%%xmm0	,%%xmm1	\n\t	unpcklpd		%%xmm4	,%%xmm5	\n\t"/* r1 -x- r3 (lo halves) ==> R0~ */\
			"unpckhpd		%%xmm0	,%%xmm2	\n\t	unpckhpd		%%xmm4	,%%xmm6	\n\t"/* r1 -x- r3 (hi halves) ==> R1~ */\
			"movaps		%%xmm2, 0x20(%%rax)	\n\t	movaps		%%xmm6, 0x60(%%rax)	\n\t"/* Tmp store R1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
			"\n\t"\
			"movaps		0x10(%%rax)	,%%xmm2	\n\t	movaps		0x50(%%rax)	,%%xmm6	\n\t"\
			"movaps			%%xmm2	,%%xmm3	\n\t	movaps			%%xmm6	,%%xmm7	\n\t"\
			"movaps		0x30(%%rax)	,%%xmm0	\n\t	movaps		0x70(%%rax)	,%%xmm4	\n\t"\
			"unpcklpd		%%xmm0	,%%xmm2	\n\t	unpcklpd		%%xmm4	,%%xmm6	\n\t"/* r2 -x- r4 (lo halves) ==> I0~ */\
			"unpckhpd		%%xmm0	,%%xmm3	\n\t	unpckhpd		%%xmm4	,%%xmm7	\n\t"/* r2 -x- r4 (hi halves) ==> I1~ */\
			"movaps		%%xmm2, 0x10(%%rax)	\n\t	movaps		%%xmm6, 0x50(%%rax)	\n\t"/* Tmp store I0~ until needed by imaginary-part-processing section */\
			"movaps		%%xmm3, 0x30(%%rax)	\n\t	movaps		%%xmm7, 0x70(%%rax)	\n\t"/* Tmp store I1~ until needed on 2nd set of SSE2_cmplx_carry.calls */\
		/* Active data in xmm1,5 here - avoid using those registers in index computation. */\
		/**********************************************/\
		/*          Real      parts                   */\
		/**********************************************/\
			"movq	%[__bjmod_0],	%%rax	\n\t"/* Pointer to bjmodn data */\
			"movaps		(%%rax)	,	%%xmm0	\n\t"/* bjmod[0:3] */\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movslq	%[__i]	,%%rdi			\n\t"/* I == 1 if it's the 0-word, in which case we force-bigword-ness by XORing esi (whose */\
			"xorq	%%rdi	,%%rsi			\n\t"/* low bit will == 0 on input in this case) with I. Otherwise I == 0, thus XOR = no-op. */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x180(%%rax),%%xmm2		\n\t	movaps	 0x190(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
			"movaps	-0x10(%%rax),%%xmm4		\n\t"/* sse2_rnd */\
			"movaps	-0x20(%%rax),%%xmm9		\n\t"/* maxerr */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x1a0(%%rax),%%xmm1		\n\t	mulpd	0x1b0(%%rax),%%xmm5		\n\t"/* x *= wi_re */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
			"movq	%[__sign_mask],%%rbx	\n\t	movaps		(%%rbx)	,	%%xmm8	\n\t"\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt_re */\
			"movaps		%%xmm1	,    (%%rax)\n\t	movaps		%%xmm5	,0x40(%%rax)\n\t"/* store x */\
		/* Update and store weights: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x1a0(%%rax),%%xmm3		\n\t	movaps	 0x1b0(%%rax),%%xmm7	\n\t"/* wi_re, inverse-wt_re */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_re *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_re *= inv_mult[i] */\
		"movaps	%%xmm2,0x180(%%rax)		\n\t	movaps	 %%xmm6,0x190(%%rax)	\n\t"/* Store wt_re */\
		"movaps	%%xmm3,0x1a0(%%rax)		\n\t	movaps	 %%xmm7,0x1b0(%%rax)	\n\t"/* Store wi_re */\
			/* Get ready for next set [IM0~] : */\
			"movq	%[__sse_n]	,%%rbx		\n\t"\
			"movaps		(%%rbx)	,%%xmm2		\n\t"\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"movaps		%%xmm0	,%%xmm1		\n\t"\
			"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
			"pand		%%xmm2	,%%xmm1		\n\t"\
			"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		/**********************************************/\
		/*          Imaginary parts                   */\
		/**********************************************/\
		"movslq		%[__p1],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x10(%%rax)	,%%xmm1	\n\t	movaps	 0x50(%%rax)	,%%xmm5	\n\t"/* I0~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x1c0(%%rax),%%xmm2		\n\t	movaps	 0x1d0(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x1e0(%%rax),%%xmm1		\n\t	mulpd	0x1f0(%%rax),%%xmm5		\n\t"/* y *= wi_im */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = y */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(y) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* y - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(y-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* y = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* y*= wt_im */\
			"movaps		%%xmm1	,0x10(%%rax)\n\t	movaps		%%xmm5	,0x50(%%rax)\n\t"/* store y */\
		/* Update and store weights: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x1e0(%%rax),%%xmm3		\n\t	movaps	 0x1f0(%%rax),%%xmm7	\n\t"/* wi_im, inverse-wt_im */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_im *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_im *= inv_mult[i] */\
		"movaps	%%xmm2,0x1c0(%%rax)		\n\t	movaps	 %%xmm6,0x1d0(%%rax)	\n\t"/* Store wt_im */\
		"movaps	%%xmm3,0x1e0(%%rax)		\n\t	movaps	 %%xmm7,0x1f0(%%rax)	\n\t"/* Store wi_im */\
			/* Get ready for next set [RE1~] : */\
			"movq	%[__sse_n]	,%%rbx		\n\t"\
			"movaps		(%%rbx)	,%%xmm2		\n\t"\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"movaps		%%xmm0	,%%xmm1		\n\t"\
			"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
			"pand		%%xmm2	,%%xmm1		\n\t"\
			"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
	/**********************************************/\
	/*          Now do the (j+2) data:            */\
	/**********************************************/\
		"movslq		%[__p2],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
		/**********************************************/\
		/*          Real      parts                   */\
		/**********************************************/\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x20(%%rax)	,%%xmm1	\n\t	movaps	 0x60(%%rax)	,%%xmm5	\n\t"/* R1~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x200(%%rax),%%xmm2		\n\t	movaps	 0x210(%%rax),%%xmm6	\n\t"/* wt_re for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x220(%%rax),%%xmm1		\n\t	mulpd	0x230(%%rax),%%xmm5		\n\t"/* x *= wi_re */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = x */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(x) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* x - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(x-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* x = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* x*= wt_re */\
			"movaps		%%xmm1	,0x20(%%rax)\n\t	movaps		%%xmm5	,0x60(%%rax)\n\t"/* store x */\
		/* Update and store weights ... the (j+2) wts are in the 8 slots (half_arr+[0x200-0x270]) above the (j) ones: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x220(%%rax),%%xmm3		\n\t	movaps	 0x230(%%rax),%%xmm7	\n\t"/* wi_re, inverse-wt_re */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_re), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_re >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_re *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_re *= inv_mult[i] */\
		"movaps	%%xmm2,0x200(%%rax)		\n\t	movaps	 %%xmm6,0x210(%%rax)	\n\t"/* Store wt_re */\
		"movaps	%%xmm3,0x220(%%rax)		\n\t	movaps	 %%xmm7,0x230(%%rax)	\n\t"/* Store wi_re */\
			/* Get ready for next set [IM0~] : */\
			"movq	%[__sse_n]	,%%rbx		\n\t"\
			"movaps		(%%rbx)	,%%xmm2		\n\t"\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"movaps		%%xmm0	,%%xmm1		\n\t"\
			"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
			"pand		%%xmm2	,%%xmm1		\n\t"\
			"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
		/**********************************************/\
		/*          Imaginary parts                   */\
		/**********************************************/\
		"movslq		%[__p3],%%r15	\n\t"\
		"prefetcht0	(%%r14,%%r15,8)	\n\t"\
			"movq	%[__sse_sw]	,	%%rbx	\n\t"\
			"movaps		(%%rbx)	,	%%xmm7	\n\t"/* sw[0:3] */\
			"psubd		%%xmm0	,	%%xmm7	\n\t"/* sw[0:3] - bjmod[0:3] */\
			"movmskps	%%xmm7	,	%%rsi	\n\t"/* Extract sign bits into 4-bit signmask */\
			"movq	%[__data]	,%%rax		\n\t"\
			"movaps	 0x30(%%rax)	,%%xmm1	\n\t	movaps	 0x70(%%rax)	,%%xmm5	\n\t"/* I1~ */\
			"movq	%[__half_arr]	,%%rax	\n\t"\
			"movaps	0x240(%%rax),%%xmm2		\n\t	movaps	 0x250(%%rax),%%xmm6	\n\t"/* wt_im for our 2 independent carry-chain pairs */\
		/*	"movaps	-0x10(%%rax),%%xmm4		\n\t"// sse2_rnd already in xmm4 */\
			"movq	%[__cyA]	,%%rdi		\n\t	movq	%[__cyB]	,%%rdx		\n\t"/* cy_in */\
			"mulpd	0x260(%%rax),%%xmm1		\n\t	mulpd	0x270(%%rax),%%xmm5		\n\t"/* y *= wi_im */\
			"movaps		%%xmm1	,%%xmm3		\n\t	movaps		%%xmm5	,%%xmm7		\n\t"/* temp = y */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* temp = DNINT(y) */\
			"subpd			%%xmm3	,%%xmm1	\n\t	subpd			%%xmm7	,%%xmm5	\n\t"/* y - temp */\
			"andpd			%%xmm8	,%%xmm1	\n\t	andpd			%%xmm8	,%%xmm5	\n\t"/* frac = fabs(y-temp) */\
			"maxpd			%%xmm1	,%%xmm9	\n\t	maxpd			%%xmm5	,%%xmm9	\n\t"/* if(frac > maxerr) maxerr=frac */\
			"mulpd		%%xmm10,%%xmm3		\n\t	mulpd		%%xmm10,%%xmm7		\n\t"/* temp = temp*prp_mult */\
			"addpd		(%%rdi),%%xmm3		\n\t	addpd		(%%rdx),%%xmm7		\n\t"/* temp = temp*prp_mult + cy */\
			"movaps		%%xmm9,-0x20(%%rax)	\n\t"/* Store maxerr */\
			"movq	%%rsi,	%%rbx													\n\t"/* byte offsets for bits <0:1> */\
			"shlq	 $4,	%%rbx			\n\t	shlq	 $2,	%%rsi			\n\t"/* and <2:3> go into rbx, rsi */\
			"andq	$0x30,	%%rbx			\n\t	andq	$0x30,	%%rsi			\n\t"/* respectively. */\
			"addq	%%rax,	%%rbx			\n\t	addq	%%rax,	%%rsi			\n\t"\
			"movaps		%%xmm3	,%%xmm1		\n\t	movaps		%%xmm7	,%%xmm5		\n\t"/* cpy temp */\
			"mulpd		0xc0(%%rbx),%%xmm3	\n\t	mulpd		0xc0(%%rsi),%%xmm7	\n\t"/* temp*baseinv[i] */\
			"addpd		%%xmm4	,%%xmm3		\n\t	addpd		%%xmm4	,%%xmm7		\n\t"\
			"subpd		%%xmm4	,%%xmm3		\n\t	subpd		%%xmm4	,%%xmm7		\n\t"/* cy_out */\
			"movaps		%%xmm3	,(%%rdi)	\n\t	movaps		%%xmm7	,(%%rdx)	\n\t"/* store cy_out */\
			"movq	%[__data]	,%%rax		\n\t"\
			"mulpd	 0x80(%%rbx),	%%xmm3	\n\t	mulpd	 0x80(%%rsi),	%%xmm7	\n\t"/* cy*base[i] */\
			"subpd		%%xmm3	,	%%xmm1	\n\t	subpd		%%xmm7	,	%%xmm5	\n\t"/* y = (temp-cy*base[i]) */\
			"mulpd		%%xmm2	,	%%xmm1	\n\t	mulpd		%%xmm6	,	%%xmm5	\n\t"/* y*= wt_im */\
			"movaps		%%xmm1	,0x30(%%rax)\n\t	movaps		%%xmm5	,0x70(%%rax)\n\t"/* store y */\
		/* Update and store weights ... the (j+2) wts are in the 8 slots (half_arr+[0x200-0x270]) above the (j) ones: */\
		"movq	%[__half_arr]	,%%rax	\n\t"\
		"movaps	0x260(%%rax),%%xmm3		\n\t	movaps	 0x270(%%rax),%%xmm7	\n\t"/* wi_im, inverse-wt_im */\
		"movaps	 0x170(%%rax),%%xmm1	\n\t	movaps		%%xmm1,%%xmm5		\n\t"/* inv_mult[1], 2 copies */\
		/* Do compare as (inv_mult[1] < wt_im), result overwrites copy of inv_mult[1]: */\
		"cmppd	$1,%%xmm2,%%xmm1		\n\t	cmppd	$1,%%xmm6,%%xmm5		\n\t"/* i = (wt_im >= inv_mult[1]) */\
		"movmskpd	%%xmm1,%%rdi		\n\t	movmskpd	%%xmm5,%%rdx		\n\t"/* Extract cmp-results into pair of 2-bit signmasks */\
		"shlq			$4,%%rdi		\n\t	shlq			$4,%%rdx		\n\t"/* ...and mpy by xmm bytewidth. */\
		"leaq 0x100(%%rax,%%rdi),%%rdi	\n\t	leaq 0x100(%%rax,%%rdx),%%rdx	\n\t"/* address = half_arr + i */\
		"mulpd	     (%%rdi),%%xmm2		\n\t	mulpd	     (%%rdx),%%xmm6		\n\t"/* wt_im *= wts_mult[i] */\
		"mulpd	 0x40(%%rdi),%%xmm3		\n\t	mulpd	 0x40(%%rdx),%%xmm7		\n\t"/* wi_im *= inv_mult[i] */\
		"movaps	%%xmm2,0x240(%%rax)		\n\t	movaps	 %%xmm6,0x250(%%rax)	\n\t"/* Store wt_im */\
		"movaps	%%xmm3,0x260(%%rax)		\n\t	movaps	 %%xmm7,0x270(%%rax)	\n\t"/* Store wi_im */\
			/* Get ready for next set [RE1~] : */\
			"movq	%[__sse_n]	,%%rbx		\n\t"\
			"movaps		(%%rbx)	,%%xmm2		\n\t"\
			"movq	%[__sse_bw]	,%%rax		\n\t"\
			"paddd		(%%rax)	,%%xmm0		\n\t"/* bjmod[0:3] += bw  */\
			"movaps		%%xmm0	,%%xmm1		\n\t"\
			"pcmpgtd	%%xmm2	,%%xmm1		\n\t"/* if(n > bjmod[0:3]) xmm1 = 11...11 */\
			"pand		%%xmm2	,%%xmm1		\n\t"\
			"psubd		%%xmm1	,%%xmm0		\n\t"/* if(n > bjmod[0:3]) bjmod[0:3] -= n */\
			"movq	%[__bjmod_0],%%rdi		\n\t"\
			"movaps	%%xmm0,(%%rdi)			\n\t"/* Write bjmod[0:3] */\
		/**********************************************/\
		/*              Repack the data:              */\
		/**********************************************/\
			"movq	%[__data],%%rax			\n\t"\
			"movaps		0x10(%%rax)	,%%xmm1	\n\t	movaps		0x50(%%rax)	,%%xmm5	\n\t"/* reload a[jp+p0 ] */\
			"movaps		0x30(%%rax)	,%%xmm0	\n\t	movaps		0x70(%%rax)	,%%xmm4	\n\t"\
			"movaps		%%xmm1		,%%xmm3	\n\t	movaps		%%xmm5		,%%xmm7	\n\t"/* cpy a[jp    ] */\
			"unpcklpd		%%xmm0	,%%xmm1	\n\t	unpcklpd		%%xmm4	,%%xmm5	\n\t"\
			"unpckhpd		%%xmm0	,%%xmm3	\n\t	unpckhpd		%%xmm4	,%%xmm7	\n\t"\
			"movaps		%%xmm1,0x10(%%rax)	\n\t	movaps		%%xmm5,0x50(%%rax)	\n\t"/* store a[jp+p0 ] */\
			"movaps		%%xmm3,0x30(%%rax)	\n\t	movaps		%%xmm7,0x70(%%rax)	\n\t"/* Store hi imag in aj2 */\
			"movaps		    (%%rax)	,%%xmm0	\n\t	movaps		0x40(%%rax)	,%%xmm4	\n\t"/* reload a[jt+p0 ] */\
			"movaps		0x20(%%rax)	,%%xmm1	\n\t	movaps		0x60(%%rax)	,%%xmm5	\n\t"\
			"movaps		%%xmm0		,%%xmm2	\n\t	movaps		%%xmm4		,%%xmm6	\n\t"/* cpy a[jt    ] */\
			"unpckhpd		%%xmm1	,%%xmm2	\n\t	unpckhpd		%%xmm5	,%%xmm6	\n\t"\
			"unpcklpd		%%xmm1	,%%xmm0	\n\t	unpcklpd		%%xmm5	,%%xmm4	\n\t"\
			"movaps		%%xmm2,0x20(%%rax)	\n\t	movaps		%%xmm6,0x60(%%rax)	\n\t"/* Store hi real in aj2 */\
			"movaps		%%xmm0,    (%%rax)	\n\t	movaps		%%xmm4,0x40(%%rax)	\n\t"/* store a[jt+p0 ] */\
		:					/* outputs: none */\
		: [__data]		"m" (Xdata)	/* All inputs from memory addresses here */\
		, [__cyA]		"m" (XcyA)		\
		, [__cyB]		"m" (XcyB)		\
		, [__bjmod_0]	"m" (Xbjmod_0)		\
		, [__i]			"m" (Xi)			\
		, [__half_arr]	"m" (Xhalf_arr)		\
		, [__sign_mask]	"m" (Xsign_mask)	\
		, [__sse_bw]	"m" (Xsse_bw)		\
		, [__sse_n]		"m" (Xsse_n)		\
		, [__sse_sw]	"m" (Xsse_sw)		\
		/* Prefetch: base address and 3 index offsets */\
		,	[__add0] "m" (Xadd0)\
		,	[__p1]   "m" (Xp1)\
		,	[__p2]   "m" (Xp2)\
		,	[__p3]   "m" (Xp3)\
		/* Mar 2018: Needed to support PRP testing: */\
		,	[__prp_mult]   "m" (Xprp_mult)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10"	/* Clobbered registers */\
	);\
	}

#endif	// !defined(USE_ARM_V8_SIMD)

#endif	/* carry_gcc_h_included */

